/* net/ipv6/route.c — kernel source snapshot; tree state includes commit
 * "ipv6: Be smarter with null_entry handling in ip6_pol_route_lookup".
 */
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <net/ip.h>
67 #include <linux/uaccess.h>
68
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72
73 static int ip6_rt_type_to_error(u8 fib6_type);
74
75 #define CREATE_TRACE_POINTS
76 #include <trace/events/fib6.h>
77 EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
78 #undef CREATE_TRACE_POINTS
79
/* Result of neighbour (NUD) checks when scoring a route/nexthop.
 * Negative values are failures with different follow-up actions;
 * a positive value means the neighbour looks reachable.
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,		/* do not use this nexthop */
	RT6_NUD_FAIL_PROBE = -2,	/* unusable now; probing is warranted */
	RT6_NUD_FAIL_DO_RR = -1,	/* unusable; round-robin to a sibling */
	RT6_NUD_SUCCEED = 1		/* neighbour is (probably) reachable */
};
86
87 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
88 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
89 static unsigned int      ip6_mtu(const struct dst_entry *dst);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void             ip6_dst_destroy(struct dst_entry *);
92 static void             ip6_dst_ifdown(struct dst_entry *,
93                                        struct net_device *dev, int how);
94 static int               ip6_dst_gc(struct dst_ops *ops);
95
96 static int              ip6_pkt_discard(struct sk_buff *skb);
97 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
98 static int              ip6_pkt_prohibit(struct sk_buff *skb);
99 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
100 static void             ip6_link_failure(struct sk_buff *skb);
101 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
102                                            struct sk_buff *skb, u32 mtu);
103 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
104                                         struct sk_buff *skb);
105 static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
106                            int strict);
107 static size_t rt6_nlmsg_size(struct fib6_info *rt);
108 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
109                          struct fib6_info *rt, struct dst_entry *dst,
110                          struct in6_addr *dest, struct in6_addr *src,
111                          int iif, int type, u32 portid, u32 seq,
112                          unsigned int flags);
113 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
114                                            struct in6_addr *daddr,
115                                            struct in6_addr *saddr);
116
117 #ifdef CONFIG_IPV6_ROUTE_INFO
118 static struct fib6_info *rt6_add_route_info(struct net *net,
119                                            const struct in6_addr *prefix, int prefixlen,
120                                            const struct in6_addr *gwaddr,
121                                            struct net_device *dev,
122                                            unsigned int pref);
123 static struct fib6_info *rt6_get_route_info(struct net *net,
124                                            const struct in6_addr *prefix, int prefixlen,
125                                            const struct in6_addr *gwaddr,
126                                            struct net_device *dev);
127 #endif
128
/* Per-CPU anchor for rt6_info dsts that are not owned by a FIB node
 * ("uncached" routes).  Entries are linked here so that device
 * teardown (rt6_uncached_list_flush_dev) can find and repoint them.
 * The spinlock protects the list on its CPU.
 */
struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
135
/* Link @rt onto the current CPU's uncached list and remember which list
 * it went on (rt6i_uncached_list) so the deletion side need not guess.
 * NOTE(review): the fib_rt_uncache counter is decremented in
 * rt6_uncached_list_del() but not incremented here — presumably the
 * callers bump it; confirm against the call sites.
 */
void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}
146
/* Unlink @rt from the uncached list it was added to, if any, and drop
 * the per-netns uncached-route statistic.  Safe to call on routes that
 * were never list_add'ed (list_empty() check).
 */
void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}
159
/* Device-teardown helper: walk every CPU's uncached list and detach
 * routes that still reference @dev, repointing their idev and dst.dev
 * to the loopback device so the references on @dev can be released.
 * A no-op for loopback itself (nothing to repoint to).
 */
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			/* swap the inet6_dev reference over to loopback */
			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			/* take a loopback ref before dropping the old dev ref */
			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
191
192 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
193                                              struct sk_buff *skb,
194                                              const void *daddr)
195 {
196         if (!ipv6_addr_any(p))
197                 return (const void *) p;
198         else if (skb)
199                 return &ipv6_hdr(skb)->daddr;
200         return daddr;
201 }
202
203 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
204                                    struct net_device *dev,
205                                    struct sk_buff *skb,
206                                    const void *daddr)
207 {
208         struct neighbour *n;
209
210         daddr = choose_neigh_daddr(gw, skb, daddr);
211         n = __ipv6_neigh_lookup(dev, daddr);
212         if (n)
213                 return n;
214
215         n = neigh_create(&nd_tbl, daddr, dev);
216         return IS_ERR(n) ? NULL : n;
217 }
218
219 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
220                                               struct sk_buff *skb,
221                                               const void *daddr)
222 {
223         const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
224
225         return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
226 }
227
228 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
229 {
230         struct net_device *dev = dst->dev;
231         struct rt6_info *rt = (struct rt6_info *)dst;
232
233         daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
234         if (!daddr)
235                 return;
236         if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
237                 return;
238         if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
239                 return;
240         __ipv6_confirm_neigh(dev, daddr);
241 }
242
/* Template for the per-netns ip6_dst_ops; copied at namespace setup.
 * Wires the IPv6 routing cache into the generic dst layer.
 */
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	dst_cow_metrics_generic,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};
261
262 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
263 {
264         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
265
266         return mtu ? : dst->dev->mtu;
267 }
268
/* Blackhole dsts deliberately ignore PMTU updates ... */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

/* ... and redirects; they exist only to swallow traffic. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}
278
/* dst_ops for blackhole routes: no gc/ifdown hooks and no-op
 * PMTU/redirect handlers (see the stubs above).
 */
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
};
290
/* Metrics template for special routes; only HOPLIMIT is listed (0 =>
 * use the default).
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

/* Template for the per-netns fib6_null_entry: the always-present
 * "no route" FIB entry (reject, unreachable, never freed — static ref).
 */
static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol  = RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};
303
/* Template for the per-netns ip6_null_entry dst: lookup misses resolve
 * to this, discarding packets with -ENETUNREACH.
 */
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
315
316 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
317
/* Policy-routing "prohibit" dst template: discard with -EACCES
 * (administratively prohibited).
 */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

/* Policy-routing "blackhole" dst template: silently discard
 * (generic dst_discard handlers, error -EINVAL).
 */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
341
342 #endif
343
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	/* Zero the rt6_info-specific tail that follows the embedded
	 * dst_entry; dst_alloc() already initialized the dst itself.
	 */
	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}
351
352 /* allocate dst with ip6_dst_ops */
353 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
354                                int flags)
355 {
356         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
357                                         1, DST_OBSOLETE_FORCE_CHK, flags);
358
359         if (rt) {
360                 rt6_info_init(rt);
361                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
362         }
363
364         return rt;
365 }
366 EXPORT_SYMBOL(ip6_dst_alloc);
367
/* dst_ops->destroy hook: release everything an rt6_info holds —
 * metrics, uncached-list membership, the inet6_dev reference, and the
 * fib6_info it was cloned from (cleared under RCU before release).
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	ip_dst_metrics_put(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	/* detach rt->from before dropping its reference */
	rcu_read_lock();
	from = rcu_dereference(rt->from);
	rcu_assign_pointer(rt->from, NULL);
	fib6_info_release(from);
	rcu_read_unlock();
}
389
/* dst_ops->ifdown hook: when @dev goes down, migrate the rt6_info's
 * inet6_dev reference to the loopback device so @dev can be released.
 */
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}
406
407 static bool __rt6_check_expired(const struct rt6_info *rt)
408 {
409         if (rt->rt6i_flags & RTF_EXPIRES)
410                 return time_after(jiffies, rt->dst.expires);
411         else
412                 return false;
413 }
414
/* Full expiry check: honour the rt's own RTF_EXPIRES timestamp first;
 * otherwise, for routes cloned from a fib6_info, treat a non-FORCE_CHK
 * obsolete state or an expired parent as expired.
 * Caller must hold rcu_read_lock (rcu_dereference of rt->from).
 */
static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}
430
/* Pick a multipath sibling of @match using the flow hash: each nexthop
 * owns a slice of the hash space bounded by fib_nh_upper_bound.  Falls
 * back to @match itself when its slice covers the hash or no usable
 * sibling is found.
 */
struct fib6_info *fib6_multipath_select(const struct net *net,
					struct fib6_info *match,
					struct flowi6 *fl6, int oif,
					const struct sk_buff *skb,
					int strict)
{
	struct fib6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.fib_nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		const struct fib6_nh *nh = &sibling->fib6_nh;
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		/* sibling owns this hash slice but scored unusable: give up */
		if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}
464
465 /*
466  *      Route lookup. rcu_read_lock() should be held.
467  */
468
/* Among the routes chained at a FIB node starting at @rt, find one that
 * matches the requested output interface (@oif) or source address
 * (@saddr), skipping dead nexthops.  Returns fib6_null_entry when a
 * strict interface match was required but none was found, or when @rt
 * itself is dead; otherwise falls back to @rt.
 * Caller holds rcu_read_lock (fib6_next traversal).
 */
static inline struct fib6_info *rt6_device_match(struct net *net,
						 struct fib6_info *rt,
						    const struct in6_addr *saddr,
						    int oif,
						    int flags)
{
	struct fib6_info *sprt;

	/* no constraints and a live nexthop: first route wins */
	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
		const struct net_device *dev = sprt->fib6_nh.fib_nh_dev;

		if (sprt->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE)
		return net->ipv6.fib6_null_entry;

	return rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
}
502
503 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred-work context for a single router reachability probe. */
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;		/* gateway address to probe */
	struct net_device *dev;		/* held by the queuer; put here */
};

/* Workqueue handler: send a unicast-solicit NS to the probe target and
 * release the resources taken when the work was queued.
 */
static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);	/* ref taken in rt6_probe() */
	kfree(work);
}
521
522 static void rt6_probe(struct fib6_nh *fib6_nh)
523 {
524         struct __rt6_probe_work *work = NULL;
525         const struct in6_addr *nh_gw;
526         struct neighbour *neigh;
527         struct net_device *dev;
528         struct inet6_dev *idev;
529
530         /*
531          * Okay, this does not seem to be appropriate
532          * for now, however, we need to check if it
533          * is really so; aka Router Reachability Probing.
534          *
535          * Router Reachability Probe MUST be rate-limited
536          * to no more than one per minute.
537          */
538         if (fib6_nh->fib_nh_gw_family)
539                 return;
540
541         nh_gw = &fib6_nh->fib_nh_gw6;
542         dev = fib6_nh->fib_nh_dev;
543         rcu_read_lock_bh();
544         idev = __in6_dev_get(dev);
545         neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
546         if (neigh) {
547                 if (neigh->nud_state & NUD_VALID)
548                         goto out;
549
550                 write_lock(&neigh->lock);
551                 if (!(neigh->nud_state & NUD_VALID) &&
552                     time_after(jiffies,
553                                neigh->updated + idev->cnf.rtr_probe_interval)) {
554                         work = kmalloc(sizeof(*work), GFP_ATOMIC);
555                         if (work)
556                                 __neigh_set_probe_once(neigh);
557                 }
558                 write_unlock(&neigh->lock);
559         } else if (time_after(jiffies, fib6_nh->last_probe +
560                                        idev->cnf.rtr_probe_interval)) {
561                 work = kmalloc(sizeof(*work), GFP_ATOMIC);
562         }
563
564         if (work) {
565                 fib6_nh->last_probe = jiffies;
566                 INIT_WORK(&work->work, rt6_probe_deferred);
567                 work->target = *nh_gw;
568                 dev_hold(dev);
569                 work->dev = dev;
570                 schedule_work(&work->work);
571         }
572
573 out:
574         rcu_read_unlock_bh();
575 }
576 #else
577 static inline void rt6_probe(struct fib6_nh *fib6_nh)
578 {
579 }
580 #endif
581
582 /*
583  * Default Router Selection (RFC 2461 6.3.6)
584  */
/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
/* Map the neighbour (NUD) state of @fib6_nh's gateway to an
 * rt6_nud_state score.  With router-preference support, a non-FAILED
 * entry still counts as a success; without it, a missing entry asks
 * the caller to round-robin instead.
 */
static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev,
					  &fib6_nh->fib_nh_gw6);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
612
/* Score a nexthop for route selection.  Returns a non-negative metric
 * (higher is better: interface match contributes 2, router preference
 * is folded in above that) or a negative rt6_nud_state failure code.
 */
static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
			   int strict)
{
	int m = 0;

	if (!oif || nh->fib_nh_dev->ifindex == oif)
		m = 2;

	/* interface mismatch is fatal under strict interface lookup */
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2;
#endif
	/* reachability only matters for gateway routes under F_REACHABLE */
	if ((strict & RT6_LOOKUP_F_REACHABLE) &&
	    !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) {
		int n = rt6_check_neigh(nh);
		if (n < 0)
			return n;
	}
	return m;
}
634
/* Evaluate one nexthop against the best score seen so far (*mpri).
 * Returns true (and updates *mpri/*do_rr) when @nh becomes the new
 * best candidate; dead or link-down nexthops are skipped outright.
 */
static bool find_match(struct fib6_nh *nh, u32 fib6_flags,
		       int oif, int strict, int *mpri, bool *do_rr)
{
	bool match_do_rr = false;
	bool rc = false;
	int m;

	if (nh->fib_nh_flags & RTNH_F_DEAD)
		goto out;

	if (ip6_ignore_linkdown(nh->fib_nh_dev) &&
	    nh->fib_nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	m = rt6_score_route(nh, fib6_flags, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(nh);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		rc = true;
	}
out:
	return rc;
}
670
/* Walk the route chain from @rt_start (stopping at @nomatch) and record
 * the best-scoring non-expired route in *match.  When @cont is given,
 * the walk stops at the first route whose metric differs from @metric
 * and reports it through *cont for a later continuation pass.
 * Caller holds rcu_read_lock (fib6_next traversal).
 */
static void __find_rr_leaf(struct fib6_info *rt_start,
			   struct fib6_info *nomatch, u32 metric,
			   struct fib6_info **match, struct fib6_info **cont,
			   int oif, int strict, bool *do_rr, int *mpri)
{
	struct fib6_info *rt;

	for (rt = rt_start;
	     rt && rt != nomatch;
	     rt = rcu_dereference(rt->fib6_next)) {
		struct fib6_nh *nh;

		if (cont && rt->fib6_metric != metric) {
			*cont = rt;
			return;
		}

		if (fib6_check_expired(rt))
			continue;

		nh = &rt->fib6_nh;
		if (find_match(nh, rt->fib6_flags, oif, strict, mpri, do_rr))
			*match = rt;
	}
}
696
/* Round-robin leaf selection: scan from the rr pointer (@rr_head) to
 * the end of the same-metric group, then wrap from @leaf back up to
 * @rr_head.  Only if neither pass matched does the scan continue into
 * the next metric group (@cont).
 */
static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				      struct fib6_info *leaf,
				      struct fib6_info *rr_head,
				      u32 metric, int oif, int strict,
				      bool *do_rr)
{
	struct fib6_info *match = NULL, *cont = NULL;
	int mpri = -1;

	__find_rr_leaf(rr_head, NULL, metric, &match, &cont,
		       oif, strict, do_rr, &mpri);

	__find_rr_leaf(leaf, rr_head, metric, &match, &cont,
		       oif, strict, do_rr, &mpri);

	if (match || !cont)
		return match;

	__find_rr_leaf(cont, NULL, metric, &match, NULL,
		       oif, strict, do_rr, &mpri);

	return match;
}
720
/* Select the best route at FIB node @fn, honouring round-robin state
 * (fn->rr_ptr).  Returns fib6_null_entry when the node has no usable
 * leaf or when @fn turns out to be an intermediate node.
 * Caller holds rcu_read_lock.
 */
static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
				   int oif, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not points to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.fib6_null_entry;
}
770
771 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
772 {
773         return (rt->fib6_flags & RTF_NONEXTHOP) || rt->fib6_nh.fib_nh_gw_family;
774 }
775
776 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Process a Route Information option (RFC 4191) received in a Router
 * Advertisement on @dev from @gwaddr: validate it, then add, refresh,
 * or delete the corresponding RTF_ROUTEINFO route.  Returns 0 on
 * success or -EINVAL for a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	/* option length is in units of 8 octets: 1..3; longer prefixes
	 * require more option space
	 */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	/* prefix_len 0 denotes a default route advertisement */
	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	/* zero lifetime withdraws an existing route */
	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		/* drop the reference taken by the get/add helpers */
		fib6_info_release(rt);
	}
	return 0;
}
850 #endif
851
852 /*
853  *      Misc support functions
854  */
855
/* Pick the net_device to attach to a dst built from @rt.
 * Called with rcu_read_lock held.
 */
static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
{
	struct net_device *dev = rt->fib6_nh.fib_nh_dev;

	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}
878
/* Map each fib6_type (RTN_*) to the dst error reported for reject
 * routes; 0 means packets can actually be delivered via this type.
 */
static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};
893
/* Return the dst.error value associated with @fib6_type (see fib6_prop). */
static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}
898
899 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
900 {
901         unsigned short flags = 0;
902
903         if (rt->dst_nocount)
904                 flags |= DST_NOCOUNT;
905         if (rt->dst_nopolicy)
906                 flags |= DST_NOPOLICY;
907         if (rt->dst_host)
908                 flags |= DST_HOST;
909
910         return flags;
911 }
912
913 static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
914 {
915         rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);
916
917         switch (ort->fib6_type) {
918         case RTN_BLACKHOLE:
919                 rt->dst.output = dst_discard_out;
920                 rt->dst.input = dst_discard;
921                 break;
922         case RTN_PROHIBIT:
923                 rt->dst.output = ip6_pkt_prohibit_out;
924                 rt->dst.input = ip6_pkt_prohibit;
925                 break;
926         case RTN_THROW:
927         case RTN_UNREACHABLE:
928         default:
929                 rt->dst.output = ip6_pkt_discard_out;
930                 rt->dst.input = ip6_pkt_discard;
931                 break;
932         }
933 }
934
935 static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
936 {
937         if (ort->fib6_flags & RTF_REJECT) {
938                 ip6_rt_init_dst_reject(rt, ort);
939                 return;
940         }
941
942         rt->dst.error = 0;
943         rt->dst.output = ip6_output;
944
945         if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
946                 rt->dst.input = ip6_input;
947         } else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
948                 rt->dst.input = ip6_mc_input;
949         } else {
950                 rt->dst.input = ip6_forward;
951         }
952
953         if (ort->fib6_nh.fib_nh_lws) {
954                 rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.fib_nh_lws);
955                 lwtunnel_set_redirect(&rt->dst);
956         }
957
958         rt->dst.lastuse = jiffies;
959 }
960
/* Caller must already hold reference to @from */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	/* link the cached dst back to its fib entry and inherit metrics */
	rt->rt6i_flags &= ~RTF_EXPIRES;
	rcu_assign_pointer(rt->from, from);
	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
}
968
/* Copy routing state from fib entry @ort into dst @rt: handlers,
 * destination (and source under subtrees), idev reference, flags and
 * gateway. Caller must already hold reference to @ort.
 */
static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
{
	struct net_device *dev = fib6_info_nh_dev(ort);

	ip6_rt_copy_init(rt, ort) */
	ip6_rt_init_dst(rt, ort);

	rt->rt6i_dst = ort->fib6_dst;
	/* dev may be NULL; then no idev reference is taken */
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_flags = ort->fib6_flags;
	if (ort->fib6_nh.fib_nh_gw_family) {
		rt->rt6i_gateway = ort->fib6_nh.fib_nh_gw6;
		rt->rt6i_flags |= RTF_GATEWAY;
	}
	rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->fib6_src;
#endif
}
988
/* Walk back up the fib tree from @fn looking for the next node that
 * carries route info (RTN_RTINFO). When a parent owns a source-address
 * subtree we did not come from, search that subtree with @saddr first.
 * Returns NULL once the tree root (RTN_TL_ROOT) is reached.
 * Runs under rcu_read_lock (parent pointers are rcu_dereference'd).
 */
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}
1006
1007 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
1008 {
1009         struct rt6_info *rt = *prt;
1010
1011         if (dst_hold_safe(&rt->dst))
1012                 return true;
1013         if (net) {
1014                 rt = net->ipv6.ip6_null_entry;
1015                 dst_hold(&rt->dst);
1016         } else {
1017                 rt = NULL;
1018         }
1019         *prt = rt;
1020         return false;
1021 }
1022
1023 /* called with rcu_lock held */
1024 static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
1025 {
1026         unsigned short flags = fib6_info_dst_flags(rt);
1027         struct net_device *dev = rt->fib6_nh.fib_nh_dev;
1028         struct rt6_info *nrt;
1029
1030         if (!fib6_info_hold_safe(rt))
1031                 goto fallback;
1032
1033         nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1034         if (!nrt) {
1035                 fib6_info_release(rt);
1036                 goto fallback;
1037         }
1038
1039         ip6_rt_copy_init(nrt, rt);
1040         return nrt;
1041
1042 fallback:
1043         nrt = dev_net(dev)->ipv6.ip6_null_entry;
1044         dst_hold(&nrt->dst);
1045         return nrt;
1046 }
1047
/* Core table lookup for ip6_route_lookup()/rt6_lookup(): find the best
 * fib entry for @fl6 in @table, preferring a cached exception route,
 * and return a referenced dst (ip6_null_entry when nothing matches).
 * Runs fully under rcu_read_lock.
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_info *f6i;
	struct fib6_node *fn;
	struct rt6_info *rt;

	/* the caller asked to ignore the nexthop oif for device matching */
	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	f6i = rcu_dereference(fn->leaf);
	if (!f6i)
		f6i = net->ipv6.fib6_null_entry;
	else
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
				      fl6->flowi6_oif, flags);

	if (f6i == net->ipv6.fib6_null_entry) {
		/* nothing usable at this node: back up the tree and retry */
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;

		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
		goto out;
	}

	/* pick one sibling of a multipath route by flow hash */
	if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
		f6i = fib6_multipath_select(net, f6i, fl6, fl6->flowi6_oif, skb,
					    flags);
	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);
	} else {
		rt = ip6_create_rt_rcu(f6i);
	}

out:
	trace_fib6_table_lookup(net, f6i, table, fl6);

	rcu_read_unlock();

	return rt;
}
1100
/* Public lookup entry point: dispatch to ip6_pol_route_lookup()
 * through fib6_rule_lookup() so policy routing rules are honoured.
 */
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
1107
1108 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1109                             const struct in6_addr *saddr, int oif,
1110                             const struct sk_buff *skb, int strict)
1111 {
1112         struct flowi6 fl6 = {
1113                 .flowi6_oif = oif,
1114                 .daddr = *daddr,
1115         };
1116         struct dst_entry *dst;
1117         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1118
1119         if (saddr) {
1120                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1121                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1122         }
1123
1124         dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1125         if (dst->error == 0)
1126                 return (struct rt6_info *) dst;
1127
1128         dst_release(dst);
1129
1130         return NULL;
1131 }
1132 EXPORT_SYMBOL(rt6_lookup);
1133
1134 /* ip6_ins_rt is called with FREE table->tb6_lock.
1135  * It takes new route entry, the addition fails by any reason the
1136  * route is released.
1137  * Caller must hold dst before calling it.
1138  */
1139
1140 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1141                         struct netlink_ext_ack *extack)
1142 {
1143         int err;
1144         struct fib6_table *table;
1145
1146         table = rt->fib6_table;
1147         spin_lock_bh(&table->tb6_lock);
1148         err = fib6_add(&table->tb6_root, rt, info, extack);
1149         spin_unlock_bh(&table->tb6_lock);
1150
1151         return err;
1152 }
1153
/* Insert @rt into its table with default netlink info for @net.
 * See __ip6_ins_rt() for locking and refcount requirements.
 */
int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = {	.nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}
1160
/* Allocate an RTF_CACHE clone of fib entry @ort pinned to host route
 * @daddr (and @saddr under subtrees). Returns NULL when @ort is going
 * away or the allocation fails, otherwise a new referenced dst.
 */
static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (!fib6_info_hold_safe(ort))
		return NULL;

	dev = ip6_rt_get_dev_rcu(ort);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt) {
		fib6_info_release(ort);
		return NULL;
	}

	ip6_rt_copy_init(rt, ort);
	/* cache entries are /128 host routes for the looked-up address */
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		/* a non-host on-link prefix matching daddr exactly means
		 * daddr is an anycast address of that prefix
		 */
		if (ort->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
1202
/* Allocate a per-cpu (RTF_PCPU) dst copy of fib entry @rt.
 * Returns NULL when @rt is going away or the allocation fails.
 */
static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	if (!fib6_info_hold_safe(rt))
		return NULL;

	/* device resolution must happen under RCU */
	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt) {
		fib6_info_release(rt);
		return NULL;
	}
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}
1224
1225 /* It should be called with rcu_read_lock() acquired */
1226 static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
1227 {
1228         struct rt6_info *pcpu_rt, **p;
1229
1230         p = this_cpu_ptr(rt->rt6i_pcpu);
1231         pcpu_rt = *p;
1232
1233         if (pcpu_rt)
1234                 ip6_hold_safe(NULL, &pcpu_rt);
1235
1236         return pcpu_rt;
1237 }
1238
/* Create and publish the per-cpu dst copy of @rt for this cpu.
 * Falls back to a held ip6_null_entry when allocation fails. The
 * cmpxchg from NULL enforces that the slot was empty; a populated
 * slot here is a logic error (BUG).
 */
static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	/* one reference for the caller; the slot itself keeps its own */
	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	return pcpu_rt;
}
1257
1258 /* exception hash table implementation
1259  */
1260 static DEFINE_SPINLOCK(rt6_exception_lock);
1261
/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct fib6_info *from;
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	net->ipv6.rt6_stats->fib_rt_cache--;

	/* purge completely the exception to allow releasing the held resources:
	 * some [sk] cache may keep the dst around for unlimited time
	 */
	from = rcu_dereference_protected(rt6_ex->rt6i->from,
					 lockdep_is_held(&rt6_exception_lock));
	/* sever the back-pointer and drop the fib entry reference */
	rcu_assign_pointer(rt6_ex->rt6i->from, NULL);
	fib6_info_release(from);
	dst_dev_put(&rt6_ex->rt6i->dst);

	hlist_del_rcu(&rt6_ex->hlist);
	/* drop the table's dst reference; rcu grace period before kfree */
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
}
1292
1293 /* Remove oldest rt6_ex in bucket and free the memory
1294  * Caller must hold rt6_exception_lock
1295  */
1296 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1297 {
1298         struct rt6_exception *rt6_ex, *oldest = NULL;
1299
1300         if (!bucket)
1301                 return;
1302
1303         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1304                 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1305                         oldest = rt6_ex;
1306         }
1307         rt6_remove_exception(bucket, oldest);
1308 }
1309
/* Hash (dst, src) into an exception-table bucket index.
 * The jhash seed is initialized lazily, once, system-wide. The source
 * address participates only when subtrees are configured.
 */
static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
1325
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	/* advance *bucket from the base of the array to the hashed slot */
	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1358
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 * (RCU-walk counterpart of __rt6_find_exception_spinlock)
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	/* advance *bucket from the base of the array to the hashed slot */
	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1393
/* Effective MTU for @rt: the stored route PMTU if set, otherwise the
 * nexthop device's IPv6 MTU; capped at IP6_MAX_MTU and reduced by any
 * lightweight-tunnel encapsulation headroom.
 */
static unsigned int fib6_mtu(const struct fib6_info *rt)
{
	unsigned int mtu;

	if (rt->fib6_pmtu) {
		mtu = rt->fib6_pmtu;
	} else {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		/* NOTE(review): assumes the device has an inet6_dev;
		 * __in6_dev_get() can return NULL - confirm callers
		 * guarantee an IPv6-enabled device here.
		 */
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(rt->fib6_nh.fib_nh_lws, mtu);
}
1414
/* Insert cached route @nrt into the exception table of fib entry @ort,
 * replacing any existing entry for the same (daddr[, saddr]) key and
 * evicting the oldest entry once a bucket exceeds FIB6_MAX_DEPTH.
 * On success the fib node's sernum is bumped so stale dsts are
 * invalidated. Returns 0 or a negative errno.
 */
static int rt6_insert_exception(struct rt6_info *nrt,
				struct fib6_info *ort)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	/* ort is being torn down; rt6_flush_exceptions() already ran */
	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		/* first exception for ort: allocate the bucket array */
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
		err = -EINVAL;
		goto out;
	}

	/* replace any existing exception for the same key */
	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->fib6_table->tb6_lock);
		fib6_update_sernum(net, ort);
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}
1494
/* Drop every cached exception route of @rt and mark the table flushed
 * so rt6_insert_exception() can no longer repopulate it.
 */
void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() to recreate the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		/* _safe variant: rt6_remove_exception unlinks entries */
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}
1521
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 * Returns the cached route for (daddr[, saddr]) or NULL when none
 * exists or the entry has expired. No reference is taken.
 */
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->fib6_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}
1553
/* Remove the passed in cached rt from the hash table that contains it.
 * Returns 0, -EINVAL when @rt is not an attached RTF_CACHE route, or
 * -ENOENT when no matching exception exists.
 * NOTE(review): rt->from is rcu_dereference'd before the spinlock is
 * taken - presumably callers hold rcu_read_lock; confirm at call sites.
 */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	from = rcu_dereference(rt->from);
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}
1597
/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp (used by the exception GC to track recency).
 * Silently does nothing when @rt is not an attached cache route or
 * has no matching exception entry.
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	if (!from || !(rt->rt6i_flags & RTF_CACHE))
		goto unlock;

	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

unlock:
	rcu_read_unlock();
}
1634
1635 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1636                                          struct rt6_info *rt, int mtu)
1637 {
1638         /* If the new MTU is lower than the route PMTU, this new MTU will be the
1639          * lowest MTU in the path: always allow updating the route PMTU to
1640          * reflect PMTU decreases.
1641          *
1642          * If the new MTU is higher, and the route PMTU is equal to the local
1643          * MTU, this means the old MTU is the lowest in the path, so allow
1644          * updating it: if other nodes now have lower MTUs, PMTU discovery will
1645          * handle this.
1646          */
1647
1648         if (dst_mtu(&rt->dst) >= mtu)
1649                 return true;
1650
1651         if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1652                 return true;
1653
1654         return false;
1655 }
1656
/* Propagate a device MTU change to @rt's cached exception routes.
 * Caller must hold rt6_exception_lock (bucket is dereferenced under
 * its protection).
 */
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}
1685
1686 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
1687
/* Remove every cached gateway exception of @rt whose gateway address
 * equals @gateway (used when a router stops being usable).
 */
static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* cheap lockless check before taking the exception lock */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			/* _safe variant: entries are unlinked while walking */
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}
1722
/* GC one exception entry: remove it when aged out, expired, or when
 * its gateway neighbour is no longer flagged as a router; otherwise
 * count it in gc_args->more so GC keeps running.
 * Caller must hold rt6_exception_lock (rt6_remove_exception requires it).
 */
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others have still references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}
1766
/* Garbage-collect the exception table of @rt: examine every entry in
 * every hash bucket via rt6_age_examine_exception().
 *
 * Holds rcu_read_lock_bh() (needed for the _noref neighbour lookup in
 * the examine helper) around rt6_exception_lock.
 */
void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* Lockless peek: skip routes with no exception table */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}
1797
/* Look up the fib6_info matching @fl6 in @table.
 *
 * Must be called with the rcu read lock held.  On a miss at the found
 * node, backtracks up the tree; if RT6_LOOKUP_F_REACHABLE was set and
 * nothing reachable was found, retries once from the original node
 * with that bit cleared so unreachable routes are also considered.
 *
 * May return net->ipv6.fib6_null_entry; never returns NULL.
 */
struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
				    int oif, struct flowi6 *fl6, int strict)
{
	struct fib6_node *fn, *saved_fn;
	struct fib6_info *f6i;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	/* Caller asked to ignore the nexthop device constraint */
	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	f6i = rt6_select(net, fn, oif, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	return f6i;
}
1829
/* Core policy-routing resolver: look up @fl6 in @table and return an
 * rt6_info with a reference held on every exit path.
 *
 * After the fib lookup the result is refined in order:
 *   1. multipath sibling selection,
 *   2. the null-entry short circuit (unreachable),
 *   3. a hit in the exception (RTF_CACHE) table,
 *   4. an uncached clone for the FLOWI_FLAG_KNOWN_NH special case,
 *   5. otherwise the per-cpu route copy.
 */
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_info *f6i;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	/* When forwarding is globally disabled, prefer routers known
	 * to be reachable (host role, cf. router selection rework).
	 */
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	f6i = fib6_table_lookup(net, table, oif, fl6, strict);
	if (f6i->fib6_nsiblings)
		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);

	if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		return rt;
	}

	/*Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		/* Only bump the use counter if the hold succeeded */
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !f6i->fib6_nh.fib_nh_gw_family)) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			/* Allocation failed: fall back to the null entry */
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		return uncached_rt;
	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(f6i);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, f6i);

		local_bh_enable();
		rcu_read_unlock();

		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
1907
1908 static struct rt6_info *ip6_pol_route_input(struct net *net,
1909                                             struct fib6_table *table,
1910                                             struct flowi6 *fl6,
1911                                             const struct sk_buff *skb,
1912                                             int flags)
1913 {
1914         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1915 }
1916
1917 struct dst_entry *ip6_route_input_lookup(struct net *net,
1918                                          struct net_device *dev,
1919                                          struct flowi6 *fl6,
1920                                          const struct sk_buff *skb,
1921                                          int flags)
1922 {
1923         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1924                 flags |= RT6_LOOKUP_F_IFACE;
1925
1926         return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1927 }
1928 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1929
/* Fill @keys with the L3 fields used for multipath hashing.
 *
 * For ICMPv6 error messages the hash must be computed over the
 * *embedded* (offending) packet's header rather than the outer one, so
 * that errors follow the same path as the flow they refer to.  In that
 * case any pre-dissected @flkeys are ignored, since they describe the
 * outer packet.
 */
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	/* Fast path: not ICMPv6, hash on the outer header */
	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	/* Only ICMPv6 *error* types embed the offending packet */
	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	/* Hash on the embedded header; dissected keys no longer apply */
	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowlabel(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}
1977
/* Compute the multipath hash for a flow, according to the netns
 * multipath hash policy: 0 = L3 fields only, 1 = five-tuple (L3+L4).
 *
 * if skb is set it will be used and fl6 can be NULL; @flkeys, when
 * non-NULL, carries already-dissected flow keys to avoid re-dissecting.
 * Returns a 31-bit hash (top bit shifted out).
 */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		/* L3-only policy: addresses, flow label, protocol */
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		/* L4 policy: addresses, ports, protocol */
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	/* Shift out the top bit so 0 can serve as "no hash" elsewhere */
	return mhash >> 1;
}
2034
/* Route an incoming IPv6 packet: build a flowi6 from the packet
 * headers, perform the input-path lookup and attach the resulting dst
 * to the skb (replacing any existing one).
 */
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	/* Collected (RX) tunnel metadata keys the lookup by tunnel id */
	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	/* Precompute the multipath hash for ICMPv6 so errors follow the
	 * same sibling as the flow they report on.
	 */
	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}
2064
2065 static struct rt6_info *ip6_pol_route_output(struct net *net,
2066                                              struct fib6_table *table,
2067                                              struct flowi6 *fl6,
2068                                              const struct sk_buff *skb,
2069                                              int flags)
2070 {
2071         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2072 }
2073
/* Output-path route lookup entry point.
 *
 * Multicast/link-local destinations are first offered to the l3mdev
 * (VRF) link-scope lookup.  Otherwise lookup flags are derived from the
 * socket binding and the flow's source address before deferring to the
 * policy-rule machinery.
 */
struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	if (ipv6_addr_type(&fl6->daddr) &
	    (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	/* Locally generated traffic: input interface is loopback */
	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		/* No source yet: honour the socket's address preferences */
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2103
/* Build a blackhole copy of @dst_orig: a standalone dst (on the
 * loopback device, with discard input/output handlers) that mirrors
 * the original's metrics, gateway, flags and keys.
 *
 * Consumes the reference on @dst_orig in all cases.  Returns the new
 * dst or ERR_PTR(-ENOMEM) on allocation failure.
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new = &rt->dst;
		new->__use = 1;
		/* Blackhole: silently drop everything in both directions */
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		/* Not a per-cpu copy even if the original was */
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}
2136
2137 /*
2138  *      Destination cache support functions
2139  */
2140
2141 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2142 {
2143         u32 rt_cookie = 0;
2144
2145         if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2146                 return false;
2147
2148         if (fib6_check_expired(f6i))
2149                 return false;
2150
2151         return true;
2152 }
2153
2154 static struct dst_entry *rt6_check(struct rt6_info *rt,
2155                                    struct fib6_info *from,
2156                                    u32 cookie)
2157 {
2158         u32 rt_cookie = 0;
2159
2160         if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2161             rt_cookie != cookie)
2162                 return NULL;
2163
2164         if (rt6_check_expired(rt))
2165                 return NULL;
2166
2167         return &rt->dst;
2168 }
2169
2170 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2171                                             struct fib6_info *from,
2172                                             u32 cookie)
2173 {
2174         if (!__rt6_check_expired(rt) &&
2175             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2176             fib6_check(from, cookie))
2177                 return &rt->dst;
2178         else
2179                 return NULL;
2180 }
2181
/* dst_ops->check handler: revalidate @dst against @cookie and return
 * it if still usable, or NULL to force the caller to re-resolve.
 */
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct dst_entry *dst_ret;
	struct fib6_info *from;
	struct rt6_info *rt;

	rt = container_of(dst, struct rt6_info, dst);

	rcu_read_lock();

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	from = rcu_dereference(rt->from);

	/* Per-cpu copies and uncached clones validate against their
	 * originating fib entry; plain routes use the simpler check.
	 */
	if (from && (rt->rt6i_flags & RTF_PCPU ||
	    unlikely(!list_empty(&rt->rt6i_uncached))))
		dst_ret = rt6_dst_from_check(rt, from, cookie);
	else
		dst_ret = rt6_check(rt, from, cookie);

	rcu_read_unlock();

	return dst_ret;
}
2209
/* dst_ops->negative_advice handler: called when the stack suspects
 * the route is bad.  Expired cache entries are removed from the
 * exception table; non-cache dsts are simply released.  Returns the
 * (possibly NULL) dst the caller should keep.
 */
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			rcu_read_lock();
			if (rt6_check_expired(rt)) {
				/* Unlink from the exception table; the
				 * caller drops its own reference.
				 */
				rt6_remove_exception_rt(rt);
				dst = NULL;
			}
			rcu_read_unlock();
		} else {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}
2229
/* dst_ops->link_failure handler: report unreachability to the sender
 * and invalidate the route that failed — cache entries are removed
 * from the exception table, default routes bump the node serial so
 * the next lookup re-resolves.
 */
static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		rcu_read_lock();
		if (rt->rt6i_flags & RTF_CACHE) {
			rt6_remove_exception_rt(rt);
		} else {
			struct fib6_info *from;
			struct fib6_node *fn;

			from = rcu_dereference(rt->from);
			if (from) {
				fn = rcu_dereference(from->fib6_node);
				/* Invalidate cached lookups through this
				 * node: -1 never matches a valid sernum.
				 */
				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
					fn->fn_sernum = -1;
			}
		}
		rcu_read_unlock();
	}
}
2255
/* Set an expiry @timeout (in jiffies from now) on @rt0 and mark it
 * RTF_EXPIRES.  If the route had no expiry yet, first seed
 * dst.expires from the originating fib entry so dst_set_expires()
 * only ever moves the deadline earlier, never later.
 */
static void rt6_update_expires(struct rt6_info *rt0, int timeout)
{
	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
		struct fib6_info *from;

		rcu_read_lock();
		from = rcu_dereference(rt0->from);
		if (from)
			rt0->dst.expires = from->expires;
		rcu_read_unlock();
	}

	dst_set_expires(&rt0->dst, timeout);
	rt0->rt6i_flags |= RTF_EXPIRES;
}
2271
/* Record a learned path MTU on @rt: set the MTU metric, flag the
 * route as modified, and (re)arm the sysctl-controlled expiry so the
 * clamp is eventually forgotten (RFC 8201 behaviour).
 */
static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
	struct net *net = dev_net(rt->dst.dev);

	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
	rt->rt6i_flags |= RTF_MODIFIED;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}
2280
2281 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2282 {
2283         return !(rt->rt6i_flags & RTF_CACHE) &&
2284                 (rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from));
2285 }
2286
2287 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2288                                  const struct ipv6hdr *iph, u32 mtu)
2289 {
2290         const struct in6_addr *daddr, *saddr;
2291         struct rt6_info *rt6 = (struct rt6_info *)dst;
2292
2293         if (dst_metric_locked(dst, RTAX_MTU))
2294                 return;
2295
2296         if (iph) {
2297                 daddr = &iph->daddr;
2298                 saddr = &iph->saddr;
2299         } else if (sk) {
2300                 daddr = &sk->sk_v6_daddr;
2301                 saddr = &inet6_sk(sk)->saddr;
2302         } else {
2303                 daddr = NULL;
2304                 saddr = NULL;
2305         }
2306         dst_confirm_neigh(dst, daddr);
2307         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2308         if (mtu >= dst_mtu(dst))
2309                 return;
2310
2311         if (!rt6_cache_allowed_for_pmtu(rt6)) {
2312                 rt6_do_update_pmtu(rt6, mtu);
2313                 /* update rt6_ex->stamp for cache */
2314                 if (rt6->rt6i_flags & RTF_CACHE)
2315                         rt6_update_exception_stamp_rt(rt6);
2316         } else if (daddr) {
2317                 struct fib6_info *from;
2318                 struct rt6_info *nrt6;
2319
2320                 rcu_read_lock();
2321                 from = rcu_dereference(rt6->from);
2322                 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
2323                 if (nrt6) {
2324                         rt6_do_update_pmtu(nrt6, mtu);
2325                         if (rt6_insert_exception(nrt6, from))
2326                                 dst_release_immediate(&nrt6->dst);
2327                 }
2328                 rcu_read_unlock();
2329         }
2330 }
2331
2332 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2333                                struct sk_buff *skb, u32 mtu)
2334 {
2335         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2336 }
2337
/* Record a path MTU (from an incoming Packet Too Big, @skb->data
 * pointing at the embedded IPv6 header) against the route matching
 * that flow.  @mtu is in network byte order.
 */
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
		     int oif, u32 mark, kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_uid = uid,
	};

	dst = ip6_route_output(net, NULL, &fl6);
	if (!dst->error)
		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2358
/* Socket-aware PMTU update: apply the new MTU for @sk's flow, then, if
 * the socket's cached dst has become invalid, refresh it (for
 * connected datagram sockets).
 */
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	int oif = sk->sk_bound_dev_if;
	struct dst_entry *dst;

	/* Unbound socket: fall back to the L3 master device, if any */
	if (!oif && skb->dev)
		oif = l3mdev_master_ifindex(skb->dev);

	ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);

	/* Nothing further to do if the cached dst is still valid */
	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2380
/* Cache @dst on @sk.  The daddr (and, with subtrees, saddr) hints are
 * passed to ip6_dst_store() only when the flow's addresses match the
 * socket's, so the cache is keyed correctly for connected flows.
 */
void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
			   const struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_SUBTREES
	struct ipv6_pinfo *np = inet6_sk(sk);
#endif

	ip6_dst_store(sk, dst,
		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
		      &sk->sk_v6_daddr : NULL,
#ifdef CONFIG_IPV6_SUBTREES
		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
		      &np->saddr :
#endif
		      NULL);
}
2397
/* Handle redirects */

/* Extended flow key for redirect processing: a flowi6 plus the
 * address of the router that sent the redirect.  fl6 must stay the
 * first member so the struct can be passed where a flowi6 is expected
 * and cast back (see __ip6_route_redirect()).
 */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;
};
2403
2404 static struct rt6_info *__ip6_route_redirect(struct net *net,
2405                                              struct fib6_table *table,
2406                                              struct flowi6 *fl6,
2407                                              const struct sk_buff *skb,
2408                                              int flags)
2409 {
2410         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2411         struct rt6_info *ret = NULL, *rt_cache;
2412         struct fib6_info *rt;
2413         struct fib6_node *fn;
2414
2415         /* Get the "current" route for this destination and
2416          * check if the redirect has come from appropriate router.
2417          *
2418          * RFC 4861 specifies that redirects should only be
2419          * accepted if they come from the nexthop to the target.
2420          * Due to the way the routes are chosen, this notion
2421          * is a bit fuzzy and one might need to check all possible
2422          * routes.
2423          */
2424
2425         rcu_read_lock();
2426         fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2427 restart:
2428         for_each_fib6_node_rt_rcu(fn) {
2429                 if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
2430                         continue;
2431                 if (fib6_check_expired(rt))
2432                         continue;
2433                 if (rt->fib6_flags & RTF_REJECT)
2434                         break;
2435                 if (!rt->fib6_nh.fib_nh_gw_family)
2436                         continue;
2437                 if (fl6->flowi6_oif != rt->fib6_nh.fib_nh_dev->ifindex)
2438                         continue;
2439                 /* rt_cache's gateway might be different from its 'parent'
2440                  * in the case of an ip redirect.
2441                  * So we keep searching in the exception table if the gateway
2442                  * is different.
2443                  */
2444                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.fib_nh_gw6)) {
2445                         rt_cache = rt6_find_cached_rt(rt,
2446                                                       &fl6->daddr,
2447                                                       &fl6->saddr);
2448                         if (rt_cache &&
2449                             ipv6_addr_equal(&rdfl->gateway,
2450                                             &rt_cache->rt6i_gateway)) {
2451                                 ret = rt_cache;
2452                                 break;
2453                         }
2454                         continue;
2455                 }
2456                 break;
2457         }
2458
2459         if (!rt)
2460                 rt = net->ipv6.fib6_null_entry;
2461         else if (rt->fib6_flags & RTF_REJECT) {
2462                 ret = net->ipv6.ip6_null_entry;
2463                 goto out;
2464         }
2465
2466         if (rt == net->ipv6.fib6_null_entry) {
2467                 fn = fib6_backtrack(fn, &fl6->saddr);
2468                 if (fn)
2469                         goto restart;
2470         }
2471
2472 out:
2473         if (ret)
2474                 ip6_hold_safe(net, &ret);
2475         else
2476                 ret = ip6_create_rt_rcu(rt);
2477
2478         rcu_read_unlock();
2479
2480         trace_fib6_table_lookup(net, rt, table, fl6);
2481         return ret;
2482 };
2483
2484 static struct dst_entry *ip6_route_redirect(struct net *net,
2485                                             const struct flowi6 *fl6,
2486                                             const struct sk_buff *skb,
2487                                             const struct in6_addr *gateway)
2488 {
2489         int flags = RT6_LOOKUP_F_HAS_SADDR;
2490         struct ip6rd_flowi rdfl;
2491
2492         rdfl.fl6 = *fl6;
2493         rdfl.gateway = *gateway;
2494
2495         return fib6_rule_lookup(net, &rdfl.fl6, skb,
2496                                 flags, __ip6_route_redirect);
2497 }
2498
/* Process an ICMPv6 redirect carried in @skb (skb->data points at the
 * embedded IPv6 header): locate the affected route and apply the
 * redirect via rt6_do_redirect().
 */
void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
		  kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_iif = LOOPBACK_IFINDEX,
		.flowi6_oif = oif,
		.flowi6_mark = mark,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_uid = uid,
	};

	/* The redirect must come from the packet's own source router */
	dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_redirect);
2519
/* Process a redirect whose destination comes from the rd_msg itself
 * (used when the redirect carries no embedded packet header).
 */
void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_iif = LOOPBACK_IFINDEX,
		.flowi6_oif = oif,
		.daddr = msg->dest,
		/* The redirect's outer daddr is us, i.e. the flow's
		 * source for the lookup (intentional, not a typo).
		 */
		.saddr = iph->daddr,
		.flowi6_uid = sock_net_uid(net, NULL),
	};

	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
2537
2538 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2539 {
2540         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2541                      sk->sk_uid);
2542 }
2543 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2544
2545 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2546 {
2547         struct net_device *dev = dst->dev;
2548         unsigned int mtu = dst_mtu(dst);
2549         struct net *net = dev_net(dev);
2550
2551         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2552
2553         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2554                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2555
2556         /*
2557          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2558          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2559          * IPV6_MAXPLEN is also valid and means: "any MSS,
2560          * rely only on pmtu discovery"
2561          */
2562         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2563                 mtu = IPV6_MAXPLEN;
2564         return mtu;
2565 }
2566
2567 static unsigned int ip6_mtu(const struct dst_entry *dst)
2568 {
2569         struct inet6_dev *idev;
2570         unsigned int mtu;
2571
2572         mtu = dst_metric_raw(dst, RTAX_MTU);
2573         if (mtu)
2574                 goto out;
2575
2576         mtu = IPV6_MIN_MTU;
2577
2578         rcu_read_lock();
2579         idev = __in6_dev_get(dst->dev);
2580         if (idev)
2581                 mtu = idev->cnf.mtu6;
2582         rcu_read_unlock();
2583
2584 out:
2585         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2586
2587         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2588 }
2589
/* MTU selection:
 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 *
 * based on ip6_dst_mtu_forward and exception logic of
 * rt6_find_cached_rt; called with rcu_read_lock
 *
 * Returns the usable MTU minus any lwtunnel encapsulation headroom.
 */
u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
                      struct in6_addr *saddr)
{
        struct rt6_exception_bucket *bucket;
        struct rt6_exception *rt6_ex;
        struct in6_addr *src_key;
        struct inet6_dev *idev;
        u32 mtu = 0;

        /* 1. a locked MTU metric on the route itself wins outright */
        if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
                mtu = f6i->fib6_pmtu;
                if (mtu)
                        goto out;
        }

        /* exceptions are keyed by source as well only when this route
         * has a source prefix (subtrees)
         */
        src_key = NULL;
#ifdef CONFIG_IPV6_SUBTREES
        if (f6i->fib6_src.plen)
                src_key = saddr;
#endif

        /* 2. pmtu from a still-valid cached exception for this flow */
        bucket = rcu_dereference(f6i->rt6i_exception_bucket);
        rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
        if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
                mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);

        /* 3. fall back to the egress device MTU, never below the
         * IPv6 minimum
         */
        if (likely(!mtu)) {
                struct net_device *dev = fib6_info_nh_dev(f6i);

                mtu = IPV6_MIN_MTU;
                idev = __in6_dev_get(dev);
                if (idev && idev->cnf.mtu6 > mtu)
                        mtu = idev->cnf.mtu6;
        }

        mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
out:
        return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
}
2637
/* Allocate a standalone (never FIB-linked) host route to fl6->daddr on
 * @dev for sending an ICMPv6 packet.
 *
 * The idev reference taken here is handed over to the rt (rt6i_idev) on
 * success and dropped explicitly on the allocation-failure path.
 * Returns the xfrm-resolved dst entry, or ERR_PTR(-ENODEV/-ENOMEM).
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
                                  struct flowi6 *fl6)
{
        struct dst_entry *dst;
        struct rt6_info *rt;
        struct inet6_dev *idev = in6_dev_get(dev);
        struct net *net = dev_net(dev);

        if (unlikely(!idev))
                return ERR_PTR(-ENODEV);

        rt = ip6_dst_alloc(net, dev, 0);
        if (unlikely(!rt)) {
                in6_dev_put(idev);
                dst = ERR_PTR(-ENOMEM);
                goto out;
        }

        rt->dst.flags |= DST_HOST;
        rt->dst.input = ip6_input;
        rt->dst.output  = ip6_output;
        rt->rt6i_gateway  = fl6->daddr;
        rt->rt6i_dst.addr = fl6->daddr;
        rt->rt6i_dst.plen = 128;
        rt->rt6i_idev     = idev;
        /* NOTE(review): hop limit metric 0 presumably makes hop-limit
         * selection fall back to the device/ndisc default -- confirm
         * against ip6_dst_hoplimit()
         */
        dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

        /* Add this dst into uncached_list so that rt6_disable_ip() can
         * do proper release of the net_device
         */
        rt6_uncached_list_add(rt);
        atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);

        dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
        return dst;
}
2676
/* dst_ops garbage collector for IPv6 route entries.
 *
 * Skips collection when the minimum GC interval has not elapsed and the
 * entry count is within ip6_rt_max_size.  Otherwise the GC expiry
 * horizon (ip6_rt_gc_expire) is bumped and fib6_run_gc() invoked; the
 * horizon is reset to half the GC timeout once the table drops below
 * gc_thresh, and decays exponentially (right shift by the elasticity
 * sysctl) on every call so pressure ramps GC up gradually.
 *
 * Returns non-zero while the table is still over rt_max_size, which
 * signals the dst layer that allocation should not proceed.
 */
static int ip6_dst_gc(struct dst_ops *ops)
{
        struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
        int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
        int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
        int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
        int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
        unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
        int entries;

        entries = dst_entries_get_fast(ops);
        if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
            entries <= rt_max_size)
                goto out;

        net->ipv6.ip6_rt_gc_expire++;
        fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
        entries = dst_entries_get_slow(ops);
        if (entries < ops->gc_thresh)
                net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
        net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
        return entries > rt_max_size;
}
2701
2702 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2703                                             struct fib6_config *cfg,
2704                                             const struct in6_addr *gw_addr,
2705                                             u32 tbid, int flags)
2706 {
2707         struct flowi6 fl6 = {
2708                 .flowi6_oif = cfg->fc_ifindex,
2709                 .daddr = *gw_addr,
2710                 .saddr = cfg->fc_prefsrc,
2711         };
2712         struct fib6_table *table;
2713         struct rt6_info *rt;
2714
2715         table = fib6_get_table(net, tbid);
2716         if (!table)
2717                 return NULL;
2718
2719         if (!ipv6_addr_any(&cfg->fc_prefsrc))
2720                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2721
2722         flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2723         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2724
2725         /* if table lookup failed, fall back to full lookup */
2726         if (rt == net->ipv6.ip6_null_entry) {
2727                 ip6_rt_put(rt);
2728                 rt = NULL;
2729         }
2730
2731         return rt;
2732 }
2733
/* Validate the gateway of a route configured with RTNH_F_ONLINK.
 *
 * Looks the gateway up in the device's FIB table (the l3mdev table, or
 * main).  The nexthop is rejected when the gateway resolves through a
 * local/anycast/reject route or through a different egress device --
 * except when the only match is the default route, which is ignored.
 * Returns 0 when acceptable, -EINVAL (with extack set) otherwise.
 */
static int ip6_route_check_nh_onlink(struct net *net,
                                     struct fib6_config *cfg,
                                     const struct net_device *dev,
                                     struct netlink_ext_ack *extack)
{
        u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
        const struct in6_addr *gw_addr = &cfg->fc_gateway;
        u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
        struct fib6_info *from;
        struct rt6_info *grt;
        int err;

        err = 0;
        grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
        if (grt) {
                /* grt->from is RCU-protected; hold the read lock while
                 * dereferencing it
                 */
                rcu_read_lock();
                from = rcu_dereference(grt->from);
                if (!grt->dst.error &&
                    /* ignore match if it is the default route */
                    from && !ipv6_addr_any(&from->fib6_dst.addr) &&
                    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
                        NL_SET_ERR_MSG(extack,
                                       "Nexthop has invalid gateway or device mismatch");
                        err = -EINVAL;
                }
                rcu_read_unlock();

                ip6_rt_put(grt);
        }

        return err;
}
2766
/* Resolve the egress device for a gatewayed route (non-onlink case).
 *
 * First tries the table named in the user config (requiring an exact
 * interface match), then falls back to a full rt6_lookup().  When the
 * caller did not pin a device, *_dev and *idev are filled from the
 * found route with references held (dev_hold/in6_dev_hold) for the
 * caller to release.  Returns 0 only if the gateway is directly
 * reachable (the found route is not itself via a gateway),
 * -EHOSTUNREACH otherwise.
 */
static int ip6_route_check_nh(struct net *net,
                              struct fib6_config *cfg,
                              struct net_device **_dev,
                              struct inet6_dev **idev)
{
        const struct in6_addr *gw_addr = &cfg->fc_gateway;
        struct net_device *dev = _dev ? *_dev : NULL;
        struct rt6_info *grt = NULL;
        int err = -EHOSTUNREACH;

        if (cfg->fc_table) {
                int flags = RT6_LOOKUP_F_IFACE;

                grt = ip6_nh_lookup_table(net, cfg, gw_addr,
                                          cfg->fc_table, flags);
                if (grt) {
                        /* a recursive gateway or a device mismatch
                         * disqualifies the table result; retry below
                         */
                        if (grt->rt6i_flags & RTF_GATEWAY ||
                            (dev && dev != grt->dst.dev)) {
                                ip6_rt_put(grt);
                                grt = NULL;
                        }
                }
        }

        if (!grt)
                grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);

        if (!grt)
                goto out;

        if (dev) {
                if (dev != grt->dst.dev) {
                        ip6_rt_put(grt);
                        goto out;
                }
        } else {
                /* hand device and idev (with refs) back to the caller */
                *_dev = dev = grt->dst.dev;
                *idev = grt->rt6i_idev;
                dev_hold(dev);
                in6_dev_hold(grt->rt6i_idev);
        }

        if (!(grt->rt6i_flags & RTF_GATEWAY))
                err = 0;

        ip6_rt_put(grt);

out:
        return err;
}
2817
/* Validate cfg->fc_gateway for a new route and, when the caller did not
 * supply a device, resolve the egress device/idev from the gateway.
 *
 * Rejects gateways that are local addresses, that are neither unicast
 * nor IPv4-mapped, or that resolve to a loopback or missing egress
 * device.  *_dev/*idev may be replaced (references held) by
 * ip6_route_check_nh().  Returns 0 on success or a negative errno with
 * extack populated for userspace.
 */
static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
                           struct net_device **_dev, struct inet6_dev **idev,
                           struct netlink_ext_ack *extack)
{
        const struct in6_addr *gw_addr = &cfg->fc_gateway;
        int gwa_type = ipv6_addr_type(gw_addr);
        /* only link-local gateways may match an address on any device */
        bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
        const struct net_device *dev = *_dev;
        bool need_addr_check = !dev;
        int err = -EINVAL;

        /* if gw_addr is local we will fail to detect this in case
         * address is still TENTATIVE (DAD in progress). rt6_lookup()
         * will return already-added prefix route via interface that
         * prefix route was assigned to, which might be non-loopback.
         */
        if (dev &&
            ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
                NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
                goto out;
        }

        if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
                /* IPv6 strictly inhibits using not link-local
                 * addresses as nexthop address.
                 * Otherwise, router will not able to send redirects.
                 * It is very good, but in some (rare!) circumstances
                 * (SIT, PtP, NBMA NOARP links) it is handy to allow
                 * some exceptions. --ANK
                 * We allow IPv4-mapped nexthops to support RFC4798-type
                 * addressing
                 */
                if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
                        NL_SET_ERR_MSG(extack, "Invalid gateway address");
                        goto out;
                }

                if (cfg->fc_flags & RTNH_F_ONLINK)
                        err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
                else
                        err = ip6_route_check_nh(net, cfg, _dev, idev);

                if (err)
                        goto out;
        }

        /* reload in case device was changed */
        dev = *_dev;

        err = -EINVAL;
        if (!dev) {
                NL_SET_ERR_MSG(extack, "Egress device not specified");
                goto out;
        } else if (dev->flags & IFF_LOOPBACK) {
                NL_SET_ERR_MSG(extack,
                               "Egress device can not be loopback device for this route");
                goto out;
        }

        /* if we did not check gw_addr above, do so now that the
         * egress device has been resolved.
         */
        if (need_addr_check &&
            ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
                NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
                goto out;
        }

        err = 0;
out:
        return err;
}
2890
2891 static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type)
2892 {
2893         if ((flags & RTF_REJECT) ||
2894             (dev && (dev->flags & IFF_LOOPBACK) &&
2895              !(addr_type & IPV6_ADDR_LOOPBACK) &&
2896              !(flags & RTF_LOCAL)))
2897                 return true;
2898
2899         return false;
2900 }
2901
/* Initialize a fib6_nh from a userspace route config.
 *
 * Resolves and validates the nexthop device, onlink flag, gateway and
 * lwt encap state.  On success fib6_nh->fib_nh_dev holds the device
 * reference taken here (or one taken by gateway validation); the
 * temporary idev reference is always dropped before returning.  On
 * failure all acquired resources (device ref, lwtstate) are released.
 * Returns 0 or a negative errno with extack populated.
 */
int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
                 struct fib6_config *cfg, gfp_t gfp_flags,
                 struct netlink_ext_ack *extack)
{
        struct net_device *dev = NULL;
        struct inet6_dev *idev = NULL;
        int addr_type;
        int err;

        fib6_nh->fib_nh_family = AF_INET6;

        err = -ENODEV;
        if (cfg->fc_ifindex) {
                dev = dev_get_by_index(net, cfg->fc_ifindex);
                if (!dev)
                        goto out;
                idev = in6_dev_get(dev);
                if (!idev)
                        goto out;
        }

        if (cfg->fc_flags & RTNH_F_ONLINK) {
                if (!dev) {
                        NL_SET_ERR_MSG(extack,
                                       "Nexthop device required for onlink");
                        goto out;
                }

                if (!(dev->flags & IFF_UP)) {
                        NL_SET_ERR_MSG(extack, "Nexthop device is not up");
                        err = -ENETDOWN;
                        goto out;
                }

                fib6_nh->fib_nh_flags |= RTNH_F_ONLINK;
        }

        fib6_nh->fib_nh_weight = 1;

        /* We cannot add true routes via loopback here,
         * they would result in kernel looping; promote them to reject routes
         */
        addr_type = ipv6_addr_type(&cfg->fc_dst);
        if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) {
                /* hold loopback dev/idev if we haven't done so. */
                if (dev != net->loopback_dev) {
                        if (dev) {
                                dev_put(dev);
                                in6_dev_put(idev);
                        }
                        dev = net->loopback_dev;
                        dev_hold(dev);
                        idev = in6_dev_get(dev);
                        if (!idev) {
                                err = -ENODEV;
                                goto out;
                        }
                }
                goto set_dev;
        }

        if (cfg->fc_flags & RTF_GATEWAY) {
                /* may resolve and replace dev/idev (references held) */
                err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
                if (err)
                        goto out;

                fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
                fib6_nh->fib_nh_gw_family = AF_INET6;
        }

        /* gateway validation above may have resolved the device */
        err = -ENODEV;
        if (!dev)
                goto out;

        if (idev->cnf.disable_ipv6) {
                NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
                err = -EACCES;
                goto out;
        }

        if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) {
                NL_SET_ERR_MSG(extack, "Nexthop device is not up");
                err = -ENETDOWN;
                goto out;
        }

        if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
            !netif_carrier_ok(dev))
                fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;

        err = fib_nh_common_init(&fib6_nh->nh_common, cfg->fc_encap,
                                 cfg->fc_encap_type, cfg, gfp_flags, extack);
        if (err)
                goto out;
set_dev:
        /* transfer the device reference to the nexthop */
        fib6_nh->fib_nh_dev = dev;
        fib6_nh->fib_nh_oif = dev->ifindex;
        err = 0;
out:
        if (idev)
                in6_dev_put(idev);

        if (err) {
                lwtstate_put(fib6_nh->fib_nh_lws);
                fib6_nh->fib_nh_lws = NULL;
                if (dev)
                        dev_put(dev);
        }

        return err;
}
3013
/* Release the common nexthop state acquired by fib6_nh_init(). */
void fib6_nh_release(struct fib6_nh *fib6_nh)
{
        fib_nh_common_release(&fib6_nh->nh_common);
}
3018
/* Allocate and initialize a fib6_info from a userspace route config.
 *
 * Validates the config (internal-only flags, route type, prefix
 * lengths), resolves or creates the FIB table, then fills in metrics,
 * expiry, protocol, addresses and the embedded nexthop.  Returns the
 * new fib6_info -- not yet inserted into the FIB -- or an ERR_PTR; on
 * failure all partially built state is dropped via fib6_info_release().
 */
static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
                                              gfp_t gfp_flags,
                                              struct netlink_ext_ack *extack)
{
        struct net *net = cfg->fc_nlinfo.nl_net;
        struct fib6_info *rt = NULL;
        struct fib6_table *table;
        int err = -EINVAL;
        int addr_type;

        /* RTF_PCPU is an internal flag; can not be set by userspace */
        if (cfg->fc_flags & RTF_PCPU) {
                NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
                goto out;
        }

        /* RTF_CACHE is an internal flag; can not be set by userspace */
        if (cfg->fc_flags & RTF_CACHE) {
                NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
                goto out;
        }

        if (cfg->fc_type > RTN_MAX) {
                NL_SET_ERR_MSG(extack, "Invalid route type");
                goto out;
        }

        if (cfg->fc_dst_len > 128) {
                NL_SET_ERR_MSG(extack, "Invalid prefix length");
                goto out;
        }
        if (cfg->fc_src_len > 128) {
                NL_SET_ERR_MSG(extack, "Invalid source address length");
                goto out;
        }
#ifndef CONFIG_IPV6_SUBTREES
        if (cfg->fc_src_len) {
                NL_SET_ERR_MSG(extack,
                               "Specifying source address requires IPV6_SUBTREES to be enabled");
                goto out;
        }
#endif

        /* without NLM_F_CREATE, prefer an existing table but still
         * create one (with a warning) for compatibility
         */
        err = -ENOBUFS;
        if (cfg->fc_nlinfo.nlh &&
            !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
                table = fib6_get_table(net, cfg->fc_table);
                if (!table) {
                        pr_warn("NLM_F_CREATE should be specified when creating new route\n");
                        table = fib6_new_table(net, cfg->fc_table);
                }
        } else {
                table = fib6_new_table(net, cfg->fc_table);
        }

        if (!table)
                goto out;

        err = -ENOMEM;
        rt = fib6_info_alloc(gfp_flags);
        if (!rt)
                goto out;

        rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
                                               extack);
        if (IS_ERR(rt->fib6_metrics)) {
                err = PTR_ERR(rt->fib6_metrics);
                /* Do not leave garbage there. */
                rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
                goto out;
        }

        /* NOTE(review): dst_nocount presumably exempts addrconf routes
         * from dst accounting/GC limits -- confirm at its use sites
         */
        if (cfg->fc_flags & RTF_ADDRCONF)
                rt->dst_nocount = true;

        if (cfg->fc_flags & RTF_EXPIRES)
                fib6_set_expires(rt, jiffies +
                                clock_t_to_jiffies(cfg->fc_expires));
        else
                fib6_clean_expires(rt);

        if (cfg->fc_protocol == RTPROT_UNSPEC)
                cfg->fc_protocol = RTPROT_BOOT;
        rt->fib6_protocol = cfg->fc_protocol;

        rt->fib6_table = table;
        rt->fib6_metric = cfg->fc_metric;
        rt->fib6_type = cfg->fc_type;
        /* RTF_GATEWAY is represented by the nexthop's gw_family below */
        rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY;

        ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
        rt->fib6_dst.plen = cfg->fc_dst_len;
        if (rt->fib6_dst.plen == 128)
                rt->dst_host = true;

#ifdef CONFIG_IPV6_SUBTREES
        ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
        rt->fib6_src.plen = cfg->fc_src_len;
#endif
        err = fib6_nh_init(net, &rt->fib6_nh, cfg, gfp_flags, extack);
        if (err)
                goto out;

        /* We cannot add true routes via loopback here,
         * they would result in kernel looping; promote them to reject routes
         */
        addr_type = ipv6_addr_type(&cfg->fc_dst);
        if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh.fib_nh_dev, addr_type))
                rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;

        /* a preferred source address must be configured on the nexthop
         * device
         */
        if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
                struct net_device *dev = fib6_info_nh_dev(rt);

                if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
                        NL_SET_ERR_MSG(extack, "Invalid source address");
                        err = -EINVAL;
                        goto out;
                }
                rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
                rt->fib6_prefsrc.plen = 128;
        } else
                rt->fib6_prefsrc.plen = 0;

        return rt;
out:
        fib6_info_release(rt);
        return ERR_PTR(err);
}
3147
3148 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3149                   struct netlink_ext_ack *extack)
3150 {
3151         struct fib6_info *rt;
3152         int err;
3153
3154         rt = ip6_route_info_create(cfg, gfp_flags, extack);
3155         if (IS_ERR(rt))
3156                 return PTR_ERR(rt);
3157
3158         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3159         fib6_info_release(rt);
3160
3161         return err;
3162 }
3163
3164 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3165 {
3166         struct net *net = info->nl_net;
3167         struct fib6_table *table;
3168         int err;
3169
3170         if (rt == net->ipv6.fib6_null_entry) {
3171                 err = -ENOENT;
3172                 goto out;
3173         }
3174
3175         table = rt->fib6_table;
3176         spin_lock_bh(&table->tb6_lock);
3177         err = fib6_del(rt, info);
3178         spin_unlock_bh(&table->tb6_lock);
3179
3180 out:
3181         fib6_info_release(rt);
3182         return err;
3183 }
3184
3185 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3186 {
3187         struct nl_info info = { .nl_net = net };
3188
3189         return __ip6_del_rt(rt, &info);
3190 }
3191
/* Delete @rt and, when fc_delete_all_nh is set, all of its multipath
 * siblings under a single table lock.
 *
 * A single RTM_DELROUTE notification covering every hop is preferred:
 * when the skb for it can be built, per-hop notifications are
 * suppressed via info->skip_notify and the combined message is sent
 * after the lock is dropped.  Consumes the caller's reference on @rt.
 * Returns 0, -ENOENT for the null entry, or the first fib6_del() error.
 */
static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
{
        struct nl_info *info = &cfg->fc_nlinfo;
        struct net *net = info->nl_net;
        struct sk_buff *skb = NULL;
        struct fib6_table *table;
        int err = -ENOENT;

        if (rt == net->ipv6.fib6_null_entry)
                goto out_put;
        table = rt->fib6_table;
        spin_lock_bh(&table->tb6_lock);

        if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
                struct fib6_info *sibling, *next_sibling;

                /* prefer to send a single notification with all hops */
                skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
                if (skb) {
                        u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

                        /* fall back to per-hop notifications if the
                         * combined message cannot be built
                         */
                        if (rt6_fill_node(net, skb, rt, NULL,
                                          NULL, NULL, 0, RTM_DELROUTE,
                                          info->portid, seq, 0) < 0) {
                                kfree_skb(skb);
                                skb = NULL;
                        } else
                                info->skip_notify = 1;
                }

                list_for_each_entry_safe(sibling, next_sibling,
                                         &rt->fib6_siblings,
                                         fib6_siblings) {
                        err = fib6_del(sibling, info);
                        if (err)
                                goto out_unlock;
                }
        }

        err = fib6_del(rt, info);
out_unlock:
        spin_unlock_bh(&table->tb6_lock);
out_put:
        fib6_info_release(rt);

        /* send the combined notification outside the table lock */
        if (skb) {
                rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
                            info->nlh, gfp_any());
        }
        return err;
}
3243
3244 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3245 {
3246         int rc = -ESRCH;
3247
3248         if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3249                 goto out;
3250
3251         if (cfg->fc_flags & RTF_GATEWAY &&
3252             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3253                 goto out;
3254
3255         rc = rt6_remove_exception_rt(rt);
3256 out:
3257         return rc;
3258 }
3259
/* Delete the route described by @cfg from its FIB table.
 *
 * For RTF_CACHE requests only a matching cached exception is removed.
 * Otherwise the FIB node for the dst/src prefix is located under RCU
 * and each route is matched against the config's ifindex, gateway,
 * metric and protocol filters before deletion.  Returns 0 on success,
 * -ESRCH when nothing matched or the table does not exist.
 */
static int ip6_route_del(struct fib6_config *cfg,
                         struct netlink_ext_ack *extack)
{
        struct rt6_info *rt_cache;
        struct fib6_table *table;
        struct fib6_info *rt;
        struct fib6_node *fn;
        int err = -ESRCH;

        table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
        if (!table) {
                NL_SET_ERR_MSG(extack, "FIB table does not exist");
                return err;
        }

        rcu_read_lock();

        fn = fib6_locate(&table->tb6_root,
                         &cfg->fc_dst, cfg->fc_dst_len,
                         &cfg->fc_src, cfg->fc_src_len,
                         !(cfg->fc_flags & RTF_CACHE));

        if (fn) {
                for_each_fib6_node_rt_rcu(fn) {
                        struct fib6_nh *nh;

                        if (cfg->fc_flags & RTF_CACHE) {
                                int rc;

                                rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
                                                              &cfg->fc_src);
                                if (rt_cache) {
                                        rc = ip6_del_cached_rt(rt_cache, cfg);
                                        if (rc != -ESRCH) {
                                                rcu_read_unlock();
                                                return rc;
                                        }
                                }
                                continue;
                        }

                        /* apply the optional config filters */
                        nh = &rt->fib6_nh;
                        if (cfg->fc_ifindex &&
                            (!nh->fib_nh_dev ||
                             nh->fib_nh_dev->ifindex != cfg->fc_ifindex))
                                continue;
                        if (cfg->fc_flags & RTF_GATEWAY &&
                            !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6))
                                continue;
                        if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
                                continue;
                        if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
                                continue;
                        /* take a reference so the route survives the
                         * RCU unlock below; skip if it is going away
                         */
                        if (!fib6_info_hold_safe(rt))
                                continue;
                        rcu_read_unlock();

                        /* if gateway was specified only delete the one hop */
                        if (cfg->fc_flags & RTF_GATEWAY)
                                return __ip6_del_rt(rt, &cfg->fc_nlinfo);

                        return __ip6_del_rt_siblings(rt, cfg);
                }
        }
        rcu_read_unlock();

        return err;
}
3328
3329 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3330 {
3331         struct netevent_redirect netevent;
3332         struct rt6_info *rt, *nrt = NULL;
3333         struct ndisc_options ndopts;
3334         struct inet6_dev *in6_dev;
3335         struct neighbour *neigh;
3336         struct fib6_info *from;
3337         struct rd_msg *msg;
3338         int optlen, on_link;
3339         u8 *lladdr;
3340
3341         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3342         optlen -= sizeof(*msg);
3343
3344         if (optlen < 0) {
3345                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3346                 return;
3347         }
3348
3349         msg = (struct rd_msg *)icmp6_hdr(skb);
3350
3351         if (ipv6_addr_is_multicast(&msg->dest)) {
3352                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3353                 return;
3354         }
3355
3356         on_link = 0;
3357         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3358                 on_link = 1;
3359         } else if (ipv6_addr_type(&msg->target) !=
3360                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3361                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3362                 return;
3363         }
3364
3365         in6_dev = __in6_dev_get(skb->dev);
3366         if (!in6_dev)
3367                 return;
3368         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3369                 return;
3370
3371         /* RFC2461 8.1:
3372          *      The IP source address of the Redirect MUST be the same as the current
3373          *      first-hop router for the specified ICMP Destination Address.
3374          */
3375
3376         if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3377                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3378                 return;
3379         }
3380
3381         lladdr = NULL;
3382         if (ndopts.nd_opts_tgt_lladdr) {
3383                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3384                                              skb->dev);
3385                 if (!lladdr) {
3386                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3387                         return;
3388                 }
3389         }
3390
3391         rt = (struct rt6_info *) dst;
3392         if (rt->rt6i_flags & RTF_REJECT) {
3393                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3394                 return;
3395         }
3396
3397         /* Redirect received -> path was valid.
3398          * Look, redirects are sent only in response to data packets,
3399          * so that this nexthop apparently is reachable. --ANK
3400          */
3401         dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3402
3403         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3404         if (!neigh)
3405                 return;
3406
3407         /*
3408          *      We have finally decided to accept it.
3409          */
3410
3411         ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3412                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
3413                      NEIGH_UPDATE_F_OVERRIDE|
3414                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3415                                      NEIGH_UPDATE_F_ISROUTER)),
3416                      NDISC_REDIRECT, &ndopts);
3417
3418         rcu_read_lock();
3419         from = rcu_dereference(rt->from);
3420         /* This fib6_info_hold() is safe here because we hold reference to rt
3421          * and rt already holds reference to fib6_info.
3422          */
3423         fib6_info_hold(from);
3424         rcu_read_unlock();
3425
3426         nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3427         if (!nrt)
3428                 goto out;
3429
3430         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3431         if (on_link)
3432                 nrt->rt6i_flags &= ~RTF_GATEWAY;
3433
3434         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3435
3436         /* No need to remove rt from the exception table if rt is
3437          * a cached route because rt6_insert_exception() will
3438          * takes care of it
3439          */
3440         if (rt6_insert_exception(nrt, from)) {
3441                 dst_release_immediate(&nrt->dst);
3442                 goto out;
3443         }
3444
3445         netevent.old = &rt->dst;
3446         netevent.new = &nrt->dst;
3447         netevent.daddr = &msg->dest;
3448         netevent.neigh = neigh;
3449         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3450
3451 out:
3452         fib6_info_release(from);
3453         neigh_release(neigh);
3454 }
3455
3456 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Look up the RA route-information route (RTF_ROUTEINFO) for
 * @prefix/@prefixlen that goes through gateway @gwaddr on @dev.
 * Returns the fib6_info with a reference held, or NULL when no
 * matching entry exists.
 */
static struct fib6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev)
{
	/* route-info entries live in the l3mdev table when the device is
	 * enslaved, otherwise in RT6_TABLE_INFO
	 */
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	int ifindex = dev->ifindex;
	struct fib6_node *fn;
	struct fib6_info *rt = NULL;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	/* exact-match locate of the prefix node (last arg: exact_match) */
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
	if (!fn)
		goto out;

	/* the iterator macro assigns each route in the node to 'rt'; when
	 * the loop terminates without break, rt is NULL (not found)
	 */
	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.fib_nh_dev->ifindex != ifindex)
			continue;
		if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
		    !rt->fib6_nh.fib_nh_gw_family)
			continue;
		if (!ipv6_addr_equal(&rt->fib6_nh.fib_nh_gw6, gwaddr))
			continue;
		/* skip entries whose refcount already dropped to zero */
		if (!fib6_info_hold_safe(rt))
			continue;
		break;
	}
out:
	rcu_read_unlock();
	return rt;
}
3493
3494 static struct fib6_info *rt6_add_route_info(struct net *net,
3495                                            const struct in6_addr *prefix, int prefixlen,
3496                                            const struct in6_addr *gwaddr,
3497                                            struct net_device *dev,
3498                                            unsigned int pref)
3499 {
3500         struct fib6_config cfg = {
3501                 .fc_metric      = IP6_RT_PRIO_USER,
3502                 .fc_ifindex     = dev->ifindex,
3503                 .fc_dst_len     = prefixlen,
3504                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3505                                   RTF_UP | RTF_PREF(pref),
3506                 .fc_protocol = RTPROT_RA,
3507                 .fc_type = RTN_UNICAST,
3508                 .fc_nlinfo.portid = 0,
3509                 .fc_nlinfo.nlh = NULL,
3510                 .fc_nlinfo.nl_net = net,
3511         };
3512
3513         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3514         cfg.fc_dst = *prefix;
3515         cfg.fc_gateway = *gwaddr;
3516
3517         /* We should treat it as a default route if prefix length is 0. */
3518         if (!prefixlen)
3519                 cfg.fc_flags |= RTF_DEFAULT;
3520
3521         ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3522
3523         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3524 }
3525 #endif
3526
/* Find the RA-learned default router route (RTF_ADDRCONF|RTF_DEFAULT)
 * through gateway @addr on @dev.  Returns the entry with a reference
 * held, or NULL.
 */
struct fib6_info *rt6_get_dflt_router(struct net *net,
				     const struct in6_addr *addr,
				     struct net_device *dev)
{
	/* default routers live in the l3mdev table when the device is
	 * enslaved, otherwise in RT6_TABLE_DFLT
	 */
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
	struct fib6_info *rt;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	/* iterator assigns each route at the tree root to 'rt'; rt is NULL
	 * when the loop ends without a break (no match)
	 */
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		struct fib6_nh *nh = &rt->fib6_nh;

		if (dev == nh->fib_nh_dev &&
		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&nh->fib_nh_gw6, addr))
			break;
	}
	/* entry may already be in teardown; report it as not found then */
	if (rt && !fib6_info_hold_safe(rt))
		rt = NULL;
	rcu_read_unlock();
	return rt;
}
3553
3554 struct fib6_info *rt6_add_dflt_router(struct net *net,
3555                                      const struct in6_addr *gwaddr,
3556                                      struct net_device *dev,
3557                                      unsigned int pref)
3558 {
3559         struct fib6_config cfg = {
3560                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3561                 .fc_metric      = IP6_RT_PRIO_USER,
3562                 .fc_ifindex     = dev->ifindex,
3563                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3564                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3565                 .fc_protocol = RTPROT_RA,
3566                 .fc_type = RTN_UNICAST,
3567                 .fc_nlinfo.portid = 0,
3568                 .fc_nlinfo.nlh = NULL,
3569                 .fc_nlinfo.nl_net = net,
3570         };
3571
3572         cfg.fc_gateway = *gwaddr;
3573
3574         if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3575                 struct fib6_table *table;
3576
3577                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3578                 if (table)
3579                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3580         }
3581
3582         return rt6_get_dflt_router(net, gwaddr, dev);
3583 }
3584
/* Delete every RA-learned default route in @table, except on interfaces
 * with accept_ra == 2 (which keep RA routes even while forwarding).
 * The walk restarts from the top after each deletion because ip6_del_rt()
 * must run outside the RCU read section and may reshape the tree.
 */
static void __rt6_purge_dflt_routers(struct net *net,
				     struct fib6_table *table)
{
	struct fib6_info *rt;

restart:
	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;

		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!idev || idev->cnf.accept_ra != 2) &&
		    fib6_info_hold_safe(rt)) {
			/* drop RCU before deleting; the reference taken
			 * just above keeps rt alive across the gap
			 */
			rcu_read_unlock();
			ip6_del_rt(net, rt);
			goto restart;
		}
	}
	rcu_read_unlock();

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}
3608
3609 void rt6_purge_dflt_routers(struct net *net)
3610 {
3611         struct fib6_table *table;
3612         struct hlist_head *head;
3613         unsigned int h;
3614
3615         rcu_read_lock();
3616
3617         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3618                 head = &net->ipv6.fib_table_hash[h];
3619                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3620                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3621                                 __rt6_purge_dflt_routers(net, table);
3622                 }
3623         }
3624
3625         rcu_read_unlock();
3626 }
3627
3628 static void rtmsg_to_fib6_config(struct net *net,
3629                                  struct in6_rtmsg *rtmsg,
3630                                  struct fib6_config *cfg)
3631 {
3632         *cfg = (struct fib6_config){
3633                 .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3634                          : RT6_TABLE_MAIN,
3635                 .fc_ifindex = rtmsg->rtmsg_ifindex,
3636                 .fc_metric = rtmsg->rtmsg_metric ? : IP6_RT_PRIO_USER,
3637                 .fc_expires = rtmsg->rtmsg_info,
3638                 .fc_dst_len = rtmsg->rtmsg_dst_len,
3639                 .fc_src_len = rtmsg->rtmsg_src_len,
3640                 .fc_flags = rtmsg->rtmsg_flags,
3641                 .fc_type = rtmsg->rtmsg_type,
3642
3643                 .fc_nlinfo.nl_net = net,
3644
3645                 .fc_dst = rtmsg->rtmsg_dst,
3646                 .fc_src = rtmsg->rtmsg_src,
3647                 .fc_gateway = rtmsg->rtmsg_gateway,
3648         };
3649 }
3650
3651 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3652 {
3653         struct fib6_config cfg;
3654         struct in6_rtmsg rtmsg;
3655         int err;
3656
3657         switch (cmd) {
3658         case SIOCADDRT:         /* Add a route */
3659         case SIOCDELRT:         /* Delete a route */
3660                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3661                         return -EPERM;
3662                 err = copy_from_user(&rtmsg, arg,
3663                                      sizeof(struct in6_rtmsg));
3664                 if (err)
3665                         return -EFAULT;
3666
3667                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3668
3669                 rtnl_lock();
3670                 switch (cmd) {
3671                 case SIOCADDRT:
3672                         err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3673                         break;
3674                 case SIOCDELRT:
3675                         err = ip6_route_del(&cfg, NULL);
3676                         break;
3677                 default:
3678                         err = -EINVAL;
3679                 }
3680                 rtnl_unlock();
3681
3682                 return err;
3683         }
3684
3685         return -EINVAL;
3686 }
3687
3688 /*
3689  *      Drop the packet on the floor
3690  */
3691
3692 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3693 {
3694         int type;
3695         struct dst_entry *dst = skb_dst(skb);
3696         switch (ipstats_mib_noroutes) {
3697         case IPSTATS_MIB_INNOROUTES:
3698                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3699                 if (type == IPV6_ADDR_ANY) {
3700                         IP6_INC_STATS(dev_net(dst->dev),
3701                                       __in6_dev_get_safely(skb->dev),
3702                                       IPSTATS_MIB_INADDRERRORS);
3703                         break;
3704                 }
3705                 /* FALLTHROUGH */
3706         case IPSTATS_MIB_OUTNOROUTES:
3707                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3708                               ipstats_mib_noroutes);
3709                 break;
3710         }
3711         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3712         kfree_skb(skb);
3713         return 0;
3714 }
3715
/* Input-path handler for blackhole routes: count a no-route and drop. */
static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
3720
/* Output-path handler for blackhole routes; skb->dev must point at the
 * egress device before the drop is accounted.
 */
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}
3726
/* Input-path handler for prohibit routes: drop with admin-prohibited. */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}
3731
/* Output-path handler for prohibit routes; skb->dev must point at the
 * egress device before the drop is accounted.
 */
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}
3737
3738 /*
3739  *      Allocate a dst for local (unicast / anycast) address.
3740  */
3741
3742 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3743                                      struct inet6_dev *idev,
3744                                      const struct in6_addr *addr,
3745                                      bool anycast, gfp_t gfp_flags)
3746 {
3747         struct fib6_config cfg = {
3748                 .fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL,
3749                 .fc_ifindex = idev->dev->ifindex,
3750                 .fc_flags = RTF_UP | RTF_ADDRCONF | RTF_NONEXTHOP,
3751                 .fc_dst = *addr,
3752                 .fc_dst_len = 128,
3753                 .fc_protocol = RTPROT_KERNEL,
3754                 .fc_nlinfo.nl_net = net,
3755                 .fc_ignore_dev_down = true,
3756         };
3757
3758         if (anycast) {
3759                 cfg.fc_type = RTN_ANYCAST;
3760                 cfg.fc_flags |= RTF_ANYCAST;
3761         } else {
3762                 cfg.fc_type = RTN_LOCAL;
3763                 cfg.fc_flags |= RTF_LOCAL;
3764         }
3765
3766         return ip6_route_info_create(&cfg, gfp_flags, NULL);
3767 }
3768
3769 /* remove deleted ip from prefsrc entries */
/* Argument bundle for fib6_remove_prefsrc(), passed through fib6_clean_all() */
struct arg_dev_net_ip {
	struct net_device *dev;	/* device losing the address; NULL matches any */
	struct net *net;
	struct in6_addr *addr;	/* the preferred-source address being removed */
};
3775
3776 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3777 {
3778         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3779         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3780         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3781
3782         if (((void *)rt->fib6_nh.fib_nh_dev == dev || !dev) &&
3783             rt != net->ipv6.fib6_null_entry &&
3784             ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3785                 spin_lock_bh(&rt6_exception_lock);
3786                 /* remove prefsrc entry */
3787                 rt->fib6_prefsrc.plen = 0;
3788                 spin_unlock_bh(&rt6_exception_lock);
3789         }
3790         return 0;
3791 }
3792
3793 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3794 {
3795         struct net *net = dev_net(ifp->idev->dev);
3796         struct arg_dev_net_ip adni = {
3797                 .dev = ifp->idev->dev,
3798                 .net = net,
3799                 .addr = &ifp->addr,
3800         };
3801         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3802 }
3803
3804 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT)
3805
3806 /* Remove routers and update dst entries when gateway turn into host. */
3807 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3808 {
3809         struct in6_addr *gateway = (struct in6_addr *)arg;
3810
3811         if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3812             rt->fib6_nh.fib_nh_gw_family &&
3813             ipv6_addr_equal(gateway, &rt->fib6_nh.fib_nh_gw6)) {
3814                 return -1;
3815         }
3816
3817         /* Further clean up cached routes in exception table.
3818          * This is needed because cached route may have a different
3819          * gateway than its 'parent' in the case of an ip redirect.
3820          */
3821         rt6_exceptions_clean_tohost(rt, gateway);
3822
3823         return 0;
3824 }
3825
/* Remove RA router routes via @gateway and clean matching cached
 * exceptions, after the gateway stopped acting as a router.
 */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}
3830
/* Event descriptor handed to the fib6_ifup()/fib6_ifdown() tree walkers */
struct arg_netdev_event {
	const struct net_device *dev;
	union {
		unsigned int nh_flags;	/* flags to clear; used by rt6_sync_up() */
		unsigned long event;	/* netdev notifier event; used by rt6_sync_down_dev() */
	};
};
3838
/* Return the first route in rt's fib6 node that belongs to the same ECMP
 * group (same metric and qualifies for multipath), or NULL.  The lockdep
 * annotations require the table write lock to be held.
 */
static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
{
	struct fib6_info *iter;
	struct fib6_node *fn;

	fn = rcu_dereference_protected(rt->fib6_node,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
	iter = rcu_dereference_protected(fn->leaf,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
	/* routes in a node are ordered; the first ECMP-capable entry with
	 * rt's metric is the head of rt's group
	 */
	while (iter) {
		if (iter->fib6_metric == rt->fib6_metric &&
		    rt6_qualify_for_ecmp(iter))
			return iter;
		iter = rcu_dereference_protected(iter->fib6_next,
				lockdep_is_held(&rt->fib6_table->tb6_lock));
	}

	return NULL;
}
3858
3859 static bool rt6_is_dead(const struct fib6_info *rt)
3860 {
3861         if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD ||
3862             (rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN &&
3863              ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev)))
3864                 return true;
3865
3866         return false;
3867 }
3868
3869 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3870 {
3871         struct fib6_info *iter;
3872         int total = 0;
3873
3874         if (!rt6_is_dead(rt))
3875                 total += rt->fib6_nh.fib_nh_weight;
3876
3877         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3878                 if (!rt6_is_dead(iter))
3879                         total += iter->fib6_nh.fib_nh_weight;
3880         }
3881
3882         return total;
3883 }
3884
3885 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3886 {
3887         int upper_bound = -1;
3888
3889         if (!rt6_is_dead(rt)) {
3890                 *weight += rt->fib6_nh.fib_nh_weight;
3891                 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3892                                                     total) - 1;
3893         }
3894         atomic_set(&rt->fib6_nh.fib_nh_upper_bound, upper_bound);
3895 }
3896
3897 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3898 {
3899         struct fib6_info *iter;
3900         int weight = 0;
3901
3902         rt6_upper_bound_set(rt, &weight, total);
3903
3904         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3905                 rt6_upper_bound_set(iter, &weight, total);
3906 }
3907
3908 void rt6_multipath_rebalance(struct fib6_info *rt)
3909 {
3910         struct fib6_info *first;
3911         int total;
3912
3913         /* In case the entire multipath route was marked for flushing,
3914          * then there is no need to rebalance upon the removal of every
3915          * sibling route.
3916          */
3917         if (!rt->fib6_nsiblings || rt->should_flush)
3918                 return;
3919
3920         /* During lookup routes are evaluated in order, so we need to
3921          * make sure upper bounds are assigned from the first sibling
3922          * onwards.
3923          */
3924         first = rt6_multipath_first_sibling(rt);
3925         if (WARN_ON_ONCE(!first))
3926                 return;
3927
3928         total = rt6_multipath_total_weight(first);
3929         rt6_multipath_upper_bound_set(first, total);
3930 }
3931
3932 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3933 {
3934         const struct arg_netdev_event *arg = p_arg;
3935         struct net *net = dev_net(arg->dev);
3936
3937         if (rt != net->ipv6.fib6_null_entry &&
3938             rt->fib6_nh.fib_nh_dev == arg->dev) {
3939                 rt->fib6_nh.fib_nh_flags &= ~arg->nh_flags;
3940                 fib6_update_sernum_upto_root(net, rt);
3941                 rt6_multipath_rebalance(rt);
3942         }
3943
3944         return 0;
3945 }
3946
3947 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3948 {
3949         struct arg_netdev_event arg = {
3950                 .dev = dev,
3951                 {
3952                         .nh_flags = nh_flags,
3953                 },
3954         };
3955
3956         if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3957                 arg.nh_flags |= RTNH_F_LINKDOWN;
3958
3959         fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3960 }
3961
3962 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3963                                    const struct net_device *dev)
3964 {
3965         struct fib6_info *iter;
3966
3967         if (rt->fib6_nh.fib_nh_dev == dev)
3968                 return true;
3969         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3970                 if (iter->fib6_nh.fib_nh_dev == dev)
3971                         return true;
3972
3973         return false;
3974 }
3975
3976 static void rt6_multipath_flush(struct fib6_info *rt)
3977 {
3978         struct fib6_info *iter;
3979
3980         rt->should_flush = 1;
3981         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3982                 iter->should_flush = 1;
3983 }
3984
3985 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
3986                                              const struct net_device *down_dev)
3987 {
3988         struct fib6_info *iter;
3989         unsigned int dead = 0;
3990
3991         if (rt->fib6_nh.fib_nh_dev == down_dev ||
3992             rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
3993                 dead++;
3994         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3995                 if (iter->fib6_nh.fib_nh_dev == down_dev ||
3996                     iter->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
3997                         dead++;
3998
3999         return dead;
4000 }
4001
4002 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
4003                                        const struct net_device *dev,
4004                                        unsigned int nh_flags)
4005 {
4006         struct fib6_info *iter;
4007
4008         if (rt->fib6_nh.fib_nh_dev == dev)
4009                 rt->fib6_nh.fib_nh_flags |= nh_flags;
4010         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4011                 if (iter->fib6_nh.fib_nh_dev == dev)
4012                         iter->fib6_nh.fib_nh_flags |= nh_flags;
4013 }
4014
/* called with write lock held for table with rt */
/* fib6_clean_all() callback handling a device-down event.  Return codes
 * drive the fib walker: 0 = keep the route, -1 = delete it, -2 = NOTE
 * (review): appears to mean "delete without notification" — confirm
 * against the fib6_clean_node() consumer.
 */
static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
{
	const struct arg_netdev_event *arg = p_arg;
	const struct net_device *dev = arg->dev;
	struct net *net = dev_net(dev);

	/* the null entry is never tied to a device */
	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	switch (arg->event) {
	case NETDEV_UNREGISTER:
		/* device disappears: delete every route using it */
		return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
	case NETDEV_DOWN:
		if (rt->should_flush)
			return -1;
		/* non-multipath route: plain delete when on this device */
		if (!rt->fib6_nsiblings)
			return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
		if (rt6_multipath_uses_dev(rt, dev)) {
			unsigned int count;

			count = rt6_multipath_dead_count(rt, dev);
			if (rt->fib6_nsiblings + 1 == count) {
				/* every nexthop dead: flush the group */
				rt6_multipath_flush(rt);
				return -1;
			}
			/* some nexthops survive: mark the dead ones and
			 * redistribute the multipath weights
			 */
			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
						   RTNH_F_LINKDOWN);
			fib6_update_sernum(net, rt);
			rt6_multipath_rebalance(rt);
		}
		return -2;
	case NETDEV_CHANGE:
		/* carrier loss: mark linkdown, but never local/anycast */
		if (rt->fib6_nh.fib_nh_dev != dev ||
		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
			break;
		rt->fib6_nh.fib_nh_flags |= RTNH_F_LINKDOWN;
		rt6_multipath_rebalance(rt);
		break;
	}

	return 0;
}
4058
4059 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4060 {
4061         struct arg_netdev_event arg = {
4062                 .dev = dev,
4063                 {
4064                         .event = event,
4065                 },
4066         };
4067         struct net *net = dev_net(dev);
4068
4069         if (net->ipv6.sysctl.skip_notify_on_dev_down)
4070                 fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
4071         else
4072                 fib6_clean_all(net, fib6_ifdown, &arg);
4073 }
4074
/* Take @dev out of IPv6 service for @event: update/remove fib routes,
 * flush uncached dsts that reference the device, then purge its ndisc
 * neighbour entries.
 */
void rt6_disable_ip(struct net_device *dev, unsigned long event)
{
	rt6_sync_down_dev(dev, event);
	rt6_uncached_list_flush_dev(dev_net(dev), dev);
	neigh_ifdown(&nd_tbl, dev);
}
4081
/* Argument bundle for rt6_mtu_change_route(), passed via fib6_clean_all() */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* new link MTU */
};
4086
/* fib6_clean_all() callback: propagate a device MTU change (@p_arg is a
 * struct rt6_mtu_change_arg) into route metrics and cached exception
 * routes.  Always returns 0 (keep walking).
 */
static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e. jumbo frame)
	 */
	if (rt->fib6_nh.fib_nh_dev == arg->dev &&
	    !fib6_metric_locked(rt, RTAX_MTU)) {
		u32 mtu = rt->fib6_pmtu;

		/* shrink unconditionally; grow only when the route was
		 * tracking the device MTU (mtu == idev->cnf.mtu6)
		 */
		if (mtu >= arg->mtu ||
		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
			fib6_metric_set(rt, RTAX_MTU, arg->mtu);

		spin_lock_bh(&rt6_exception_lock);
		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}
4121
4122 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4123 {
4124         struct rt6_mtu_change_arg arg = {
4125                 .dev = dev,
4126                 .mtu = mtu,
4127         };
4128
4129         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4130 }
4131
/* Netlink attribute policy for RTM_{NEW,DEL,GET}ROUTE requests */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
	[RTA_OIF]               = { .type = NLA_U32 },
	[RTA_IIF]		= { .type = NLA_U32 },
	[RTA_PRIORITY]          = { .type = NLA_U32 },
	[RTA_METRICS]           = { .type = NLA_NESTED },
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]              = { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
	[RTA_ENCAP]		= { .type = NLA_NESTED },
	[RTA_EXPIRES]		= { .type = NLA_U32 },
	[RTA_UID]		= { .type = NLA_U32 },
	[RTA_MARK]		= { .type = NLA_U32 },
	[RTA_TABLE]		= { .type = NLA_U32 },
	[RTA_IP_PROTO]		= { .type = NLA_U8 },
	[RTA_SPORT]		= { .type = NLA_U16 },
	[RTA_DPORT]		= { .type = NLA_U16 },
};
4151
4152 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4153                               struct fib6_config *cfg,
4154                               struct netlink_ext_ack *extack)
4155 {
4156         struct rtmsg *rtm;
4157         struct nlattr *tb[RTA_MAX+1];
4158         unsigned int pref;
4159         int err;
4160
4161         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4162                           extack);
4163         if (err < 0)
4164                 goto errout;
4165
4166         err = -EINVAL;
4167         rtm = nlmsg_data(nlh);
4168
4169         *cfg = (struct fib6_config){
4170                 .fc_table = rtm->rtm_table,
4171                 .fc_dst_len = rtm->rtm_dst_len,
4172                 .fc_src_len = rtm->rtm_src_len,
4173                 .fc_flags = RTF_UP,
4174                 .fc_protocol = rtm->rtm_protocol,
4175                 .fc_type = rtm->rtm_type,
4176
4177                 .fc_nlinfo.portid = NETLINK_CB(skb).portid,
4178                 .fc_nlinfo.nlh = nlh,
4179                 .fc_nlinfo.nl_net = sock_net(skb->sk),
4180         };
4181
4182         if (rtm->rtm_type == RTN_UNREACHABLE ||
4183             rtm->rtm_type == RTN_BLACKHOLE ||
4184             rtm->rtm_type == RTN_PROHIBIT ||
4185             rtm->rtm_type == RTN_THROW)
4186                 cfg->fc_flags |= RTF_REJECT;
4187
4188         if (rtm->rtm_type == RTN_LOCAL)
4189                 cfg->fc_flags |= RTF_LOCAL;
4190
4191         if (rtm->rtm_flags & RTM_F_CLONED)
4192                 cfg->fc_flags |= RTF_CACHE;
4193
4194         cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4195
4196         if (tb[RTA_GATEWAY]) {
4197                 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4198                 cfg->fc_flags |= RTF_GATEWAY;
4199         }
4200         if (tb[RTA_VIA]) {
4201                 NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
4202                 goto errout;
4203         }
4204
4205         if (tb[RTA_DST]) {
4206                 int plen = (rtm->rtm_dst_len + 7) >> 3;
4207
4208                 if (nla_len(tb[RTA_DST]) < plen)
4209                         goto errout;
4210
4211                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4212         }
4213
4214         if (tb[RTA_SRC]) {
4215                 int plen = (rtm->rtm_src_len + 7) >> 3;
4216
4217                 if (nla_len(tb[RTA_SRC]) < plen)
4218                         goto errout;
4219
4220                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4221         }
4222
4223         if (tb[RTA_PREFSRC])
4224                 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4225
4226         if (tb[RTA_OIF])
4227                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4228
4229         if (tb[RTA_PRIORITY])
4230                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4231
4232         if (tb[RTA_METRICS]) {
4233                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4234                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4235         }
4236
4237         if (tb[RTA_TABLE])
4238                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4239
4240         if (tb[RTA_MULTIPATH]) {
4241                 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4242                 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4243
4244                 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4245                                                      cfg->fc_mp_len, extack);
4246                 if (err < 0)
4247                         goto errout;
4248         }
4249
4250         if (tb[RTA_PREF]) {
4251                 pref = nla_get_u8(tb[RTA_PREF]);
4252                 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4253                     pref != ICMPV6_ROUTER_PREF_HIGH)
4254                         pref = ICMPV6_ROUTER_PREF_MEDIUM;
4255                 cfg->fc_flags |= RTF_PREF(pref);
4256         }
4257
4258         if (tb[RTA_ENCAP])
4259                 cfg->fc_encap = tb[RTA_ENCAP];
4260
4261         if (tb[RTA_ENCAP_TYPE]) {
4262                 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4263
4264                 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4265                 if (err < 0)
4266                         goto errout;
4267         }
4268
4269         if (tb[RTA_EXPIRES]) {
4270                 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4271
4272                 if (addrconf_finite_timeout(timeout)) {
4273                         cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4274                         cfg->fc_flags |= RTF_EXPIRES;
4275                 }
4276         }
4277
4278         err = 0;
4279 errout:
4280         return err;
4281 }
4282
/* One pending nexthop parsed from an RTA_MULTIPATH request, queued on a
 * local list while ip6_route_multipath_add() builds the whole route.
 */
struct rt6_nh {
	struct fib6_info *fib6_info;	/* route created for this nexthop */
	struct fib6_config r_cfg;	/* per-nexthop config, kept for rollback deletes */
	struct list_head next;		/* link in the caller's rt6_nh_list */
};
4288
4289 static int ip6_route_info_append(struct net *net,
4290                                  struct list_head *rt6_nh_list,
4291                                  struct fib6_info *rt,
4292                                  struct fib6_config *r_cfg)
4293 {
4294         struct rt6_nh *nh;
4295         int err = -EEXIST;
4296
4297         list_for_each_entry(nh, rt6_nh_list, next) {
4298                 /* check if fib6_info already exists */
4299                 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4300                         return err;
4301         }
4302
4303         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4304         if (!nh)
4305                 return -ENOMEM;
4306         nh->fib6_info = rt;
4307         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4308         list_add_tail(&nh->next, rt6_nh_list);
4309
4310         return 0;
4311 }
4312
4313 static void ip6_route_mpath_notify(struct fib6_info *rt,
4314                                    struct fib6_info *rt_last,
4315                                    struct nl_info *info,
4316                                    __u16 nlflags)
4317 {
4318         /* if this is an APPEND route, then rt points to the first route
4319          * inserted and rt_last points to last route inserted. Userspace
4320          * wants a consistent dump of the route which starts at the first
4321          * nexthop. Since sibling routes are always added at the end of
4322          * the list, find the first sibling of the last route appended
4323          */
4324         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4325                 rt = list_first_entry(&rt_last->fib6_siblings,
4326                                       struct fib6_info,
4327                                       fib6_siblings);
4328         }
4329
4330         if (rt)
4331                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4332 }
4333
/* RTM_NEWROUTE handler for requests carrying RTA_MULTIPATH: build one
 * fib6_info per nexthop, insert them all, and send a single notification
 * covering the whole multipath route.  On a partial insert failure, every
 * route already added by this call is deleted again before returning.
 */
static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct fib6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	__u16 nlflags;
	int remaining;
	int attrlen;
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);

	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * fib6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		/* per-nexthop config starts as a copy of the route config,
		 * then gets the nexthop-specific ifindex/gateway/encap
		 */
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}
		if (!rt6_qualify_for_ecmp(rt)) {
			err = -EINVAL;
			NL_SET_ERR_MSG(extack,
				       "Device only routes can not be added for IPv6 using the multipath API.");
			fib6_info_release(rt);
			goto cleanup;
		}

		/* rtnh_hops carries (weight - 1) on the wire */
		rt->fib6_nh.fib_nh_weight = rtnh->rtnh_hops + 1;

		/* on success the list owns the reference to rt */
		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
					    rt, &r_cfg);
		if (err) {
			fib6_info_release(rt);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* for add and replace send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		err = __ip6_ins_rt(nh->fib6_info, info, extack);
		fib6_info_release(nh->fib6_info);

		if (!err) {
			/* save reference to last route successfully inserted */
			rt_last = nh->fib6_info;

			/* save reference to first route for notification */
			if (!rt_notif)
				rt_notif = nh->fib6_info;
		}

		/* NOTE(review): rt_last/rt_notif are raw pointers saved
		 * after our reference was released above; they are kept
		 * alive only by the reference the FIB tree took on insert.
		 * Confirm nothing can remove the route before the
		 * ip6_route_mpath_notify() calls below dereference them.
		 */

		/* nh->fib6_info is used or freed at this point, reset to NULL*/
		nh->fib6_info = NULL;
		if (err) {
			if (replace && nhn)
				NL_SET_ERR_MSG_MOD(extack,
						   "multipath route replace failed (check consistency of installed routes)");
			err_nh = nh;
			goto add_errout;
		}

		/* Because each route is added like a single route we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
		 */
		/* NOTE(review): dereferences fc_nlinfo.nlh without a NULL
		 * check; presumably multipath adds always originate from
		 * netlink so nlh is set — verify against callers.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		nhn++;
	}

	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	goto cleanup;

add_errout:
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg, extack);
	}

cleanup:
	/* release any routes that were never inserted, then the list nodes */
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->fib6_info)
			fib6_info_release(nh->fib6_info);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}
4481
4482 static int ip6_route_multipath_del(struct fib6_config *cfg,
4483                                    struct netlink_ext_ack *extack)
4484 {
4485         struct fib6_config r_cfg;
4486         struct rtnexthop *rtnh;
4487         int remaining;
4488         int attrlen;
4489         int err = 1, last_err = 0;
4490
4491         remaining = cfg->fc_mp_len;
4492         rtnh = (struct rtnexthop *)cfg->fc_mp;
4493
4494         /* Parse a Multipath Entry */
4495         while (rtnh_ok(rtnh, remaining)) {
4496                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4497                 if (rtnh->rtnh_ifindex)
4498                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4499
4500                 attrlen = rtnh_attrlen(rtnh);
4501                 if (attrlen > 0) {
4502                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4503
4504                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4505                         if (nla) {
4506                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4507                                 r_cfg.fc_flags |= RTF_GATEWAY;
4508                         }
4509                 }
4510                 err = ip6_route_del(&r_cfg, extack);
4511                 if (err)
4512                         last_err = err;
4513
4514                 rtnh = rtnh_next(rtnh, &remaining);
4515         }
4516
4517         return last_err;
4518 }
4519
4520 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4521                               struct netlink_ext_ack *extack)
4522 {
4523         struct fib6_config cfg;
4524         int err;
4525
4526         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4527         if (err < 0)
4528                 return err;
4529
4530         if (cfg.fc_mp)
4531                 return ip6_route_multipath_del(&cfg, extack);
4532         else {
4533                 cfg.fc_delete_all_nh = 1;
4534                 return ip6_route_del(&cfg, extack);
4535         }
4536 }
4537
4538 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4539                               struct netlink_ext_ack *extack)
4540 {
4541         struct fib6_config cfg;
4542         int err;
4543
4544         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4545         if (err < 0)
4546                 return err;
4547
4548         if (cfg.fc_metric == 0)
4549                 cfg.fc_metric = IP6_RT_PRIO_USER;
4550
4551         if (cfg.fc_mp)
4552                 return ip6_route_multipath_add(&cfg, extack);
4553         else
4554                 return ip6_route_add(&cfg, GFP_KERNEL, extack);
4555 }
4556
4557 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4558 {
4559         int nexthop_len = 0;
4560
4561         if (rt->fib6_nsiblings) {
4562                 nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
4563                             + NLA_ALIGN(sizeof(struct rtnexthop))
4564                             + nla_total_size(16) /* RTA_GATEWAY */
4565                             + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws);
4566
4567                 nexthop_len *= rt->fib6_nsiblings;
4568         }
4569
4570         return NLMSG_ALIGN(sizeof(struct rtmsg))
4571                + nla_total_size(16) /* RTA_SRC */
4572                + nla_total_size(16) /* RTA_DST */
4573                + nla_total_size(16) /* RTA_GATEWAY */
4574                + nla_total_size(16) /* RTA_PREFSRC */
4575                + nla_total_size(4) /* RTA_TABLE */
4576                + nla_total_size(4) /* RTA_IIF */
4577                + nla_total_size(4) /* RTA_OIF */
4578                + nla_total_size(4) /* RTA_PRIORITY */
4579                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4580                + nla_total_size(sizeof(struct rta_cacheinfo))
4581                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4582                + nla_total_size(1) /* RTA_PREF */
4583                + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws)
4584                + nexthop_len;
4585 }
4586
/* Fill one RTM_NEWROUTE/RTM_DELROUTE record for @rt into @skb.
 * @dst, when non-NULL, is the resolved rt6_info (cache/lookup result)
 * whose keys, flags and metrics take precedence over the fib entry's.
 * @dest/@src, when non-NULL, are the exact lookup addresses and force
 * /128 prefix lengths in the dump.  @iif selects the input-interface
 * form of the record.  Returns 0 on success, -EMSGSIZE if the record
 * does not fit (the partial message is cancelled).
 */
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	struct rt6_info *rt6 = (struct rt6_info *)dst;
	struct rt6key *rt6_dst, *rt6_src;
	u32 *pmetrics, table, rt6_flags;
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;
	long expires = 0;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	/* prefer the resolved dst's keys/flags when one was supplied */
	if (rt6) {
		rt6_dst = &rt6->rt6i_dst;
		rt6_src = &rt6->rt6i_src;
		rt6_flags = rt6->rt6i_flags;
	} else {
		rt6_dst = &rt->fib6_dst;
		rt6_src = &rt->fib6_src;
		rt6_flags = rt->fib6_flags;
	}

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt6_dst->plen;
	rtm->rtm_src_len = rt6_src->plen;
	rtm->rtm_tos = 0;
	if (rt->fib6_table)
		table = rt->fib6_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	/* table ids >= 256 don't fit in the legacy u8 header field */
	rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;

	rtm->rtm_type = rt->fib6_type;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->fib6_protocol;

	if (rt6_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	/* an explicit lookup address is reported as a host (/128) route */
	if (dest) {
		if (nla_put_in6_addr(skb, RTA_DST, dest))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		/* multicast destinations are resolved via the mroute table */
		if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dest) {
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->fib6_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->fib6_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	/* dst metrics override the fib entry's when a dst is supplied */
	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt6) {
		if (rt6_flags & RTF_GATEWAY &&
		    nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
			goto nla_put_failure;

		if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
			goto nla_put_failure;
	} else if (rt->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (fib_add_nexthop(skb, &rt->fib6_nh.nh_common,
				    rt->fib6_nh.fib_nh_weight) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings, fib6_siblings) {
			if (fib_add_nexthop(skb, &sibling->fib6_nh.nh_common,
					    sibling->fib6_nh.fib_nh_weight) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		if (fib_nexthop_info(skb, &rt->fib6_nh.nh_common,
				     &rtm->rtm_flags, false) < 0)
			goto nla_put_failure;
	}

	if (rt6_flags & RTF_EXPIRES) {
		expires = dst ? dst->expires : rt->expires;
		expires -= jiffies;
	}

	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
		goto nla_put_failure;


	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	/* roll back the partially-built message */
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
4740
4741 static bool fib6_info_uses_dev(const struct fib6_info *f6i,
4742                                const struct net_device *dev)
4743 {
4744         if (f6i->fib6_nh.fib_nh_dev == dev)
4745                 return true;
4746
4747         if (f6i->fib6_nsiblings) {
4748                 struct fib6_info *sibling, *next_sibling;
4749
4750                 list_for_each_entry_safe(sibling, next_sibling,
4751                                          &f6i->fib6_siblings, fib6_siblings) {
4752                         if (sibling->fib6_nh.fib_nh_dev == dev)
4753                                 return true;
4754                 }
4755         }
4756
4757         return false;
4758 }
4759
4760 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4761 {
4762         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4763         struct fib_dump_filter *filter = &arg->filter;
4764         unsigned int flags = NLM_F_MULTI;
4765         struct net *net = arg->net;
4766
4767         if (rt == net->ipv6.fib6_null_entry)
4768                 return 0;
4769
4770         if ((filter->flags & RTM_F_PREFIX) &&
4771             !(rt->fib6_flags & RTF_PREFIX_RT)) {
4772                 /* success since this is not a prefix route */
4773                 return 1;
4774         }
4775         if (filter->filter_set) {
4776                 if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
4777                     (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
4778                     (filter->protocol && rt->fib6_protocol != filter->protocol)) {
4779                         return 1;
4780                 }
4781                 flags |= NLM_F_DUMP_FILTERED;
4782         }
4783
4784         return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4785                              RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4786                              arg->cb->nlh->nlmsg_seq, flags);
4787 }
4788
/* Validate an RTM_GETROUTE request before it is acted on.  Sockets that
 * opted into strict checking get full header/attribute validation;
 * legacy callers only get the policy parse.  Fills @tb with the parsed
 * attributes on success; returns 0 or -EINVAL/parse error.
 */
static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
					const struct nlmsghdr *nlh,
					struct nlattr **tb,
					struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	int i, err;

	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
		NL_SET_ERR_MSG_MOD(extack,
				   "Invalid header for get route request");
		return -EINVAL;
	}

	/* legacy (non-strict) sockets: relaxed parse only */
	if (!netlink_strict_get_check(skb))
		return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX,
				   rtm_ipv6_policy, extack);

	/* header fields that have no meaning in a get request must be 0;
	 * src/dst lengths, when given, must describe full host addresses
	 */
	rtm = nlmsg_data(nlh);
	if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
	    rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
	    rtm->rtm_type) {
		NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
		return -EINVAL;
	}
	if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
		NL_SET_ERR_MSG_MOD(extack,
				   "Invalid flags for get route request");
		return -EINVAL;
	}

	err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
				 rtm_ipv6_policy, extack);
	if (err)
		return err;

	/* an address attribute requires its matching prefix length */
	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
		NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
		return -EINVAL;
	}

	/* only lookup-key attributes are accepted in a get request */
	for (i = 0; i <= RTA_MAX; i++) {
		if (!tb[i])
			continue;

		switch (i) {
		case RTA_SRC:
		case RTA_DST:
		case RTA_IIF:
		case RTA_OIF:
		case RTA_MARK:
		case RTA_UID:
		case RTA_SPORT:
		case RTA_DPORT:
		case RTA_IP_PROTO:
			break;
		default:
			NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
			return -EINVAL;
		}
	}

	return 0;
}
4855
/* Netlink RTM_GETROUTE handler: build a flow from the request
 * attributes, perform an input-side lookup (RTA_IIF present) or an
 * output-side lookup, and unicast the resulting route back to the
 * requester.  With RTM_F_FIB_MATCH the matched FIB entry is dumped
 * instead of the resolved dst.
 */
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct fib6_info *from;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6 = {};
	bool fibmatch;

	err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

	/* populate the flow keys from the request attributes */
	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (tb[RTA_SPORT])
		fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);

	if (tb[RTA_DPORT])
		fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);

	if (tb[RTA_IP_PROTO]) {
		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
						  &fl6.flowi6_proto, AF_INET6,
						  extack);
		if (err)
			goto errout;
	}

	if (iif) {
		/* input-interface lookup: resolve as if the packet had
		 * arrived on that device
		 */
		struct net_device *dev;
		int flags = 0;

		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			rcu_read_unlock();
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);

		rcu_read_unlock();
	} else {
		fl6.flowi6_oif = oif;

		dst = ip6_route_output(net, NULL, &fl6);
	}


	rt = container_of(dst, struct rt6_info, dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	/* null entry means no route; its dst.error carries the reason */
	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	/* skb takes over the dst reference; freed with the skb */
	skb_dst_set(skb, &rt->dst);

	rcu_read_lock();
	/* NOTE(review): rt->from may be NULL here (e.g. if the originating
	 * fib entry was unlinked); rt6_fill_node is handed it unchecked —
	 * confirm it tolerates a NULL fib6_info.
	 */
	from = rcu_dereference(rt->from);

	if (fibmatch)
		err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	else
		err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
				    &fl6.saddr, iif, RTM_NEWROUTE,
				    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
				    0);
	rcu_read_unlock();

	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}
4995
4996 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
4997                      unsigned int nlm_flags)
4998 {
4999         struct sk_buff *skb;
5000         struct net *net = info->nl_net;
5001         u32 seq;
5002         int err;
5003
5004         err = -ENOBUFS;
5005         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
5006
5007         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
5008         if (!skb)
5009                 goto errout;
5010
5011         err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
5012                             event, info->portid, seq, nlm_flags);
5013         if (err < 0) {
5014                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
5015                 WARN_ON(err == -EMSGSIZE);
5016                 kfree_skb(skb);
5017                 goto errout;
5018         }
5019         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
5020                     info->nlh, gfp_any());
5021         return;
5022 errout:
5023         if (err < 0)
5024                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
5025 }
5026
5027 static int ip6_route_dev_notify(struct notifier_block *this,
5028                                 unsigned long event, void *ptr)
5029 {
5030         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
5031         struct net *net = dev_net(dev);
5032
5033         if (!(dev->flags & IFF_LOOPBACK))
5034                 return NOTIFY_OK;
5035
5036         if (event == NETDEV_REGISTER) {
5037                 net->ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = dev;
5038                 net->ipv6.ip6_null_entry->dst.dev = dev;
5039                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
5040 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5041                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
5042                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
5043                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
5044                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
5045 #endif
5046          } else if (event == NETDEV_UNREGISTER &&
5047                     dev->reg_state != NETREG_UNREGISTERED) {
5048                 /* NETDEV_UNREGISTER could be fired for multiple times by
5049                  * netdev_wait_allrefs(). Make sure we only call this once.
5050                  */
5051                 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
5052 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5053                 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
5054                 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
5055 #endif
5056         }
5057
5058         return NOTIFY_OK;
5059 }
5060
5061 /*
5062  *      /proc
5063  */
5064
5065 #ifdef CONFIG_PROC_FS
5066 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
5067 {
5068         struct net *net = (struct net *)seq->private;
5069         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
5070                    net->ipv6.rt6_stats->fib_nodes,
5071                    net->ipv6.rt6_stats->fib_route_nodes,
5072                    atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
5073                    net->ipv6.rt6_stats->fib_rt_entries,
5074                    net->ipv6.rt6_stats->fib_rt_cache,
5075                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
5076                    net->ipv6.rt6_stats->fib_discarded_routes);
5077
5078         return 0;
5079 }
5080 #endif  /* CONFIG_PROC_FS */
5081
5082 #ifdef CONFIG_SYSCTL
5083
/* Handler for the write-only (mode 0200) net.ipv6.route.flush sysctl:
 * any write triggers a fib6 garbage-collection pass for the owning
 * namespace (stashed in ctl->extra1).
 */
static
int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
                              void __user *buffer, size_t *lenp, loff_t *ppos)
{
        struct net *net;
        int delay;
        int ret;
        if (!write)
                return -EINVAL;

        net = (struct net *)ctl->extra1;
        /* NOTE(review): 'delay' is snapshotted *before* proc_dointvec()
         * stores the newly written value into flush_delay, so the GC run
         * below uses the value from the previous write. Confirm this is
         * the intended semantics.
         */
        delay = net->ipv6.sysctl.flush_delay;
        ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
        if (ret)
                return ret;

        fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
        return 0;
}
5103
/* Range bounds for the boolean skip_notify_on_dev_down entry below. */
static int zero;
static int one = 1;

/* Template for the per-namespace net.ipv6.route.* sysctl table.  The
 * .data pointers reference init_net and are rewritten per namespace by
 * ipv6_route_sysctl_init(), which addresses entries BY INDEX -- keep the
 * order here and the indices there in sync.
 */
static struct ctl_table ipv6_route_table_template[] = {
        {
                .procname       =       "flush",
                .data           =       &init_net.ipv6.sysctl.flush_delay,
                .maxlen         =       sizeof(int),
                .mode           =       0200,
                .proc_handler   =       ipv6_sysctl_rtcache_flush
        },
        {
                .procname       =       "gc_thresh",
                .data           =       &ip6_dst_ops_template.gc_thresh,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec,
        },
        {
                .procname       =       "max_size",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec,
        },
        {
                .procname       =       "gc_min_interval",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_jiffies,
        },
        {
                .procname       =       "gc_timeout",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_jiffies,
        },
        {
                .procname       =       "gc_interval",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_jiffies,
        },
        {
                .procname       =       "gc_elasticity",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec,
        },
        {
                .procname       =       "mtu_expires",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_jiffies,
        },
        {
                .procname       =       "min_adv_mss",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec,
        },
        {
                /* Same variable as gc_min_interval, but in milliseconds. */
                .procname       =       "gc_min_interval_ms",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_ms_jiffies,
        },
        {
                .procname       =       "skip_notify_on_dev_down",
                .data           =       &init_net.ipv6.sysctl.skip_notify_on_dev_down,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec,
                .extra1         =       &zero,
                .extra2         =       &one,
        },
        { }
};
5189
5190 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
5191 {
5192         struct ctl_table *table;
5193
5194         table = kmemdup(ipv6_route_table_template,
5195                         sizeof(ipv6_route_table_template),
5196                         GFP_KERNEL);
5197
5198         if (table) {
5199                 table[0].data = &net->ipv6.sysctl.flush_delay;
5200                 table[0].extra1 = net;
5201                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
5202                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
5203                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5204                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
5205                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
5206                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
5207                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5208                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5209                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5210                 table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;
5211
5212                 /* Don't export sysctls to unprivileged users */
5213                 if (net->user_ns != &init_user_ns)
5214                         table[0].procname = NULL;
5215         }
5216
5217         return table;
5218 }
5219 #endif
5220
/* Per-namespace constructor for the IPv6 routing state: clone the dst
 * ops template, allocate this namespace's special route entries, and
 * seed the GC/PMTU sysctl tunables with their defaults.
 * Returns 0 or -ENOMEM; on failure every step already completed is
 * unwound via the goto ladder at the bottom (reverse order of setup).
 */
static int __net_init ip6_route_net_init(struct net *net)
{
        int ret = -ENOMEM;

        memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
               sizeof(net->ipv6.ip6_dst_ops));

        if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
                goto out_ip6_dst_ops;

        /* FIB-level sentinel entry for this namespace. */
        net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
                                            sizeof(*net->ipv6.fib6_null_entry),
                                            GFP_KERNEL);
        if (!net->ipv6.fib6_null_entry)
                goto out_ip6_dst_entries;

        /* dst-level sentinel; its dst.ops must point at the per-netns copy. */
        net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
                                           sizeof(*net->ipv6.ip6_null_entry),
                                           GFP_KERNEL);
        if (!net->ipv6.ip6_null_entry)
                goto out_fib6_null_entry;
        net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
        dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
                         ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
        net->ipv6.fib6_has_custom_rules = false;
        net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
                                               sizeof(*net->ipv6.ip6_prohibit_entry),
                                               GFP_KERNEL);
        if (!net->ipv6.ip6_prohibit_entry)
                goto out_ip6_null_entry;
        net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
        dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
                         ip6_template_metrics, true);

        net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
                                               sizeof(*net->ipv6.ip6_blk_hole_entry),
                                               GFP_KERNEL);
        if (!net->ipv6.ip6_blk_hole_entry)
                goto out_ip6_prohibit_entry;
        net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
        dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
                         ip6_template_metrics, true);
#endif

        /* Default tunables; user-visible via net.ipv6.route.* sysctls. */
        net->ipv6.sysctl.flush_delay = 0;
        net->ipv6.sysctl.ip6_rt_max_size = 4096;
        net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
        net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
        net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
        net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
        net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
        net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
        net->ipv6.sysctl.skip_notify_on_dev_down = 0;

        net->ipv6.ip6_rt_gc_expire = 30*HZ;

        ret = 0;
out:
        return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
        kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
        kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
        kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
        dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
        goto out;
}
5296
5297 static void __net_exit ip6_route_net_exit(struct net *net)
5298 {
5299         kfree(net->ipv6.fib6_null_entry);
5300         kfree(net->ipv6.ip6_null_entry);
5301 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5302         kfree(net->ipv6.ip6_prohibit_entry);
5303         kfree(net->ipv6.ip6_blk_hole_entry);
5304 #endif
5305         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5306 }
5307
5308 static int __net_init ip6_route_net_init_late(struct net *net)
5309 {
5310 #ifdef CONFIG_PROC_FS
5311         proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
5312                         sizeof(struct ipv6_route_iter));
5313         proc_create_net_single("rt6_stats", 0444, net->proc_net,
5314                         rt6_stats_seq_show, NULL);
5315 #endif
5316         return 0;
5317 }
5318
/* Late per-namespace teardown: remove the /proc/net files created by
 * ip6_route_net_init_late().
 */
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
        remove_proc_entry("ipv6_route", net->proc_net);
        remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
5326
/* Per-network-namespace constructor/destructor for the core routing state. */
static struct pernet_operations ip6_route_net_ops = {
        .init = ip6_route_net_init,
        .exit = ip6_route_net_exit,
};
5331
5332 static int __net_init ipv6_inetpeer_init(struct net *net)
5333 {
5334         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5335
5336         if (!bp)
5337                 return -ENOMEM;
5338         inet_peer_base_init(bp);
5339         net->ipv6.peers = bp;
5340         return 0;
5341 }
5342
5343 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5344 {
5345         struct inet_peer_base *bp = net->ipv6.peers;
5346
5347         net->ipv6.peers = NULL;
5348         inetpeer_invalidate_tree(bp);
5349         kfree(bp);
5350 }
5351
/* Pernet hooks for the per-namespace IPv6 inet_peer storage. */
static struct pernet_operations ipv6_inetpeer_ops = {
        .init   =       ipv6_inetpeer_init,
        .exit   =       ipv6_inetpeer_exit,
};

/* Pernet hooks registered late in ip6_route_init(), after the FIB and
 * rule layers are up (they create the /proc/net files).
 */
static struct pernet_operations ip6_route_net_late_ops = {
        .init = ip6_route_net_init_late,
        .exit = ip6_route_net_exit_late,
};

/* Netdevice notifier; priority sits just below addrconf's so that
 * addrconf's notifier is presumably invoked first for the same event.
 */
static struct notifier_block ip6_route_dev_notifier = {
        .notifier_call = ip6_route_dev_notify,
        .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
5366
5367 void __init ip6_route_init_special_entries(void)
5368 {
5369         /* Registering of the loopback is done before this portion of code,
5370          * the loopback reference in rt6_info will not be taken, do it
5371          * manually for init_net */
5372         init_net.ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = init_net.loopback_dev;
5373         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5374         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5375   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5376         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5377         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5378         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5379         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5380   #endif
5381 }
5382
/* Module init for the IPv6 routing subsystem: create the dst slab cache,
 * register the pernet subsystems (inetpeer, core routing, late /proc),
 * bring up the FIB, xfrm and policy-rule layers, install the rtnetlink
 * route handlers and the device notifier, and initialize the per-cpu
 * uncached-route lists.  On any failure the goto ladder at the bottom
 * unwinds exactly the steps completed so far, in reverse order.
 */
int __init ip6_route_init(void)
{
        int ret;
        int cpu;

        ret = -ENOMEM;
        ip6_dst_ops_template.kmem_cachep =
                kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
                                  SLAB_HWCACHE_ALIGN, NULL);
        if (!ip6_dst_ops_template.kmem_cachep)
                goto out;

        ret = dst_entries_init(&ip6_dst_blackhole_ops);
        if (ret)
                goto out_kmem_cache;

        ret = register_pernet_subsys(&ipv6_inetpeer_ops);
        if (ret)
                goto out_dst_entries;

        ret = register_pernet_subsys(&ip6_route_net_ops);
        if (ret)
                goto out_register_inetpeer;

        /* Blackhole dsts share the same slab as regular rt6_info dsts. */
        ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

        ret = fib6_init();
        if (ret)
                goto out_register_subsys;

        ret = xfrm6_init();
        if (ret)
                goto out_fib6_init;

        ret = fib6_rules_init();
        if (ret)
                goto xfrm6_init;

        ret = register_pernet_subsys(&ip6_route_net_late_ops);
        if (ret)
                goto fib6_rules_init;

        /* All three rtnetlink handlers unwind through the same label:
         * rtnl_unregister_all() below removes whichever succeeded.
         */
        ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
                                   inet6_rtm_newroute, NULL, 0);
        if (ret < 0)
                goto out_register_late_subsys;

        ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
                                   inet6_rtm_delroute, NULL, 0);
        if (ret < 0)
                goto out_register_late_subsys;

        ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
                                   inet6_rtm_getroute, NULL,
                                   RTNL_FLAG_DOIT_UNLOCKED);
        if (ret < 0)
                goto out_register_late_subsys;

        ret = register_netdevice_notifier(&ip6_route_dev_notifier);
        if (ret)
                goto out_register_late_subsys;

        /* Initialize the per-cpu uncached-route list heads and locks. */
        for_each_possible_cpu(cpu) {
                struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

                INIT_LIST_HEAD(&ul->head);
                spin_lock_init(&ul->lock);
        }

out:
        return ret;

out_register_late_subsys:
        rtnl_unregister_all(PF_INET6);
        unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
        fib6_rules_cleanup();
xfrm6_init:
        xfrm6_fini();
out_fib6_init:
        fib6_gc_cleanup();
out_register_subsys:
        unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
        unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
        dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
        kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
        goto out;
}
5474
/* Module teardown: undo ip6_route_init() step by step, in the reverse
 * order of registration.
 */
void ip6_route_cleanup(void)
{
        unregister_netdevice_notifier(&ip6_route_dev_notifier);
        unregister_pernet_subsys(&ip6_route_net_late_ops);
        fib6_rules_cleanup();
        xfrm6_fini();
        fib6_gc_cleanup();
        unregister_pernet_subsys(&ipv6_inetpeer_ops);
        unregister_pernet_subsys(&ip6_route_net_ops);
        dst_entries_destroy(&ip6_dst_blackhole_ops);
        kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}