/*
 * net/ipv6/route.c
 * (snapshot taken at commit: "ipv6: grab rt->rt6i_ref before allocating pcpu rt")
 */
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <trace/events/fib6.h>
67
68 #include <linux/uaccess.h>
69
70 #ifdef CONFIG_SYSCTL
71 #include <linux/sysctl.h>
72 #endif
73
/* Outcome of next-hop neighbour (NUD) evaluation used when scoring routes;
 * negative values are failures, from soft (do round-robin) to hard.
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,		/* unusable, skip this route */
	RT6_NUD_FAIL_PROBE = -2,	/* neighbour failed, probe it */
	RT6_NUD_FAIL_DO_RR = -1,	/* unknown, fall back to round-robin */
	RT6_NUD_SUCCEED = 1		/* neighbour considered reachable */
};
80
81 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
82 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
83 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
84 static unsigned int      ip6_mtu(const struct dst_entry *dst);
85 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
86 static void             ip6_dst_destroy(struct dst_entry *);
87 static void             ip6_dst_ifdown(struct dst_entry *,
88                                        struct net_device *dev, int how);
89 static int               ip6_dst_gc(struct dst_ops *ops);
90
91 static int              ip6_pkt_discard(struct sk_buff *skb);
92 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
93 static int              ip6_pkt_prohibit(struct sk_buff *skb);
94 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
95 static void             ip6_link_failure(struct sk_buff *skb);
96 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
97                                            struct sk_buff *skb, u32 mtu);
98 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
99                                         struct sk_buff *skb);
100 static void             rt6_dst_from_metrics_check(struct rt6_info *rt);
101 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
102 static size_t rt6_nlmsg_size(struct rt6_info *rt);
103 static int rt6_fill_node(struct net *net,
104                          struct sk_buff *skb, struct rt6_info *rt,
105                          struct in6_addr *dst, struct in6_addr *src,
106                          int iif, int type, u32 portid, u32 seq,
107                          unsigned int flags);
108 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
109                                            struct in6_addr *daddr,
110                                            struct in6_addr *saddr);
111
112 #ifdef CONFIG_IPV6_ROUTE_INFO
113 static struct rt6_info *rt6_add_route_info(struct net *net,
114                                            const struct in6_addr *prefix, int prefixlen,
115                                            const struct in6_addr *gwaddr,
116                                            struct net_device *dev,
117                                            unsigned int pref);
118 static struct rt6_info *rt6_get_route_info(struct net *net,
119                                            const struct in6_addr *prefix, int prefixlen,
120                                            const struct in6_addr *gwaddr,
121                                            struct net_device *dev);
122 #endif
123
/* Per-CPU list of uncached rt6_info entries (routes not linked into the
 * FIB), so their device references can be redirected when a netdevice
 * goes away; @lock protects @head.
 */
struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
130
131 static void rt6_uncached_list_add(struct rt6_info *rt)
132 {
133         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
134
135         rt->rt6i_uncached_list = ul;
136
137         spin_lock_bh(&ul->lock);
138         list_add_tail(&rt->rt6i_uncached, &ul->head);
139         spin_unlock_bh(&ul->lock);
140 }
141
142 static void rt6_uncached_list_del(struct rt6_info *rt)
143 {
144         if (!list_empty(&rt->rt6i_uncached)) {
145                 struct uncached_list *ul = rt->rt6i_uncached_list;
146
147                 spin_lock_bh(&ul->lock);
148                 list_del(&rt->rt6i_uncached);
149                 spin_unlock_bh(&ul->lock);
150         }
151 }
152
/* Called when @dev is being unregistered: walk every CPU's uncached list
 * and move any route still referencing @dev (either via its inet6_dev or
 * via dst.dev) over to the namespace's loopback device, fixing up the
 * refcounts, so the device can actually go away.
 */
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	/* loopback itself is never unregistered this way */
	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			/* swap the inet6_dev reference to loopback */
			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			/* swap the netdevice reference to loopback:
			 * hold the new dev before dropping the old one
			 */
			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
184
/* A per-CPU clone does not own metrics: write through the metrics of the
 * dst it was cloned from (rt->dst.from).
 */
static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
{
	return dst_metrics_write_ptr(rt->dst.from);
}
189
190 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
191 {
192         struct rt6_info *rt = (struct rt6_info *)dst;
193
194         if (rt->rt6i_flags & RTF_PCPU)
195                 return rt6_pcpu_cow_metrics(rt);
196         else if (rt->rt6i_flags & RTF_CACHE)
197                 return NULL;
198         else
199                 return dst_cow_metrics_generic(dst, old);
200 }
201
202 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
203                                              struct sk_buff *skb,
204                                              const void *daddr)
205 {
206         struct in6_addr *p = &rt->rt6i_gateway;
207
208         if (!ipv6_addr_any(p))
209                 return (const void *) p;
210         else if (skb)
211                 return &ipv6_hdr(skb)->daddr;
212         return daddr;
213 }
214
215 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
216                                           struct sk_buff *skb,
217                                           const void *daddr)
218 {
219         struct rt6_info *rt = (struct rt6_info *) dst;
220         struct neighbour *n;
221
222         daddr = choose_neigh_daddr(rt, skb, daddr);
223         n = __ipv6_neigh_lookup(dst->dev, daddr);
224         if (n)
225                 return n;
226         return neigh_create(&nd_tbl, daddr, dst->dev);
227 }
228
229 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
230 {
231         struct net_device *dev = dst->dev;
232         struct rt6_info *rt = (struct rt6_info *)dst;
233
234         daddr = choose_neigh_daddr(rt, NULL, daddr);
235         if (!daddr)
236                 return;
237         if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
238                 return;
239         if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
240                 return;
241         __ipv6_confirm_neigh(dev, daddr);
242 }
243
/* Template dst_ops for regular IPv6 routes; copied per-netns into
 * net->ipv6.ip6_dst_ops at namespace init.
 */
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	ipv6_cow_metrics,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};
262
263 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
264 {
265         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
266
267         return mtu ? : dst->dev->mtu;
268 }
269
/* PMTU updates are deliberately ignored on blackhole dsts. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}
274
/* Redirects are deliberately ignored on blackhole dsts. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}
279
/* dst_ops for blackhole routes: identical bookkeeping to normal routes
 * but PMTU/redirect events are no-ops.
 */
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_neigh_lookup,
};
291
/* Metrics shared by the template route entries below; hop limit 0 means
 * "use the per-device/namespace default".
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};
295
/* Template for the per-netns "null" route: a permanent reject entry that
 * drops packets with ICMPv6 "no route" (-ENETUNREACH) and is returned
 * when a lookup fails.
 */
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,	/* worst possible metric */
	.rt6i_ref	= ATOMIC_INIT(1),
};
310
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

/* Template for the "prohibit" route used by policy routing: rejects
 * packets with ICMPv6 "administratively prohibited" (-EACCES).
 */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

/* Template for the "blackhole" route used by policy routing: silently
 * discards packets (no ICMPv6 error is generated).
 */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

#endif
344
/* Zero the rt6_info-specific tail of a freshly allocated entry (everything
 * after the embedded dst_entry, which dst_alloc() already initialized) and
 * set up its list heads.
 */
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	/* dst + 1 points just past the dst_entry member */
	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_siblings);
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}
353
354 /* allocate dst with ip6_dst_ops */
355 static struct rt6_info *__ip6_dst_alloc(struct net *net,
356                                         struct net_device *dev,
357                                         int flags)
358 {
359         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
360                                         1, DST_OBSOLETE_FORCE_CHK, flags);
361
362         if (rt)
363                 rt6_info_init(rt);
364
365         return rt;
366 }
367
368 struct rt6_info *ip6_dst_alloc(struct net *net,
369                                struct net_device *dev,
370                                int flags)
371 {
372         struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
373
374         if (rt) {
375                 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
376                 if (rt->rt6i_pcpu) {
377                         int cpu;
378
379                         for_each_possible_cpu(cpu) {
380                                 struct rt6_info **p;
381
382                                 p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
383                                 /* no one shares rt */
384                                 *p =  NULL;
385                         }
386                 } else {
387                         dst_release_immediate(&rt->dst);
388                         return NULL;
389                 }
390         }
391
392         return rt;
393 }
394 EXPORT_SYMBOL(ip6_dst_alloc);
395
/* dst_ops->destroy: final teardown of a rt6_info once its refcount hits
 * zero — free metrics, the per-CPU cache array, the exception bucket,
 * and drop the inet6_dev and "from" dst references.
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct rt6_exception_bucket *bucket;
	struct dst_entry *from = dst->from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	free_percpu(rt->rt6i_pcpu);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}
	/* last reference is gone, so plain (non-RCU) access is safe */
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
	if (bucket) {
		rt->rt6i_exception_bucket = NULL;
		kfree(bucket);
	}

	/* clear before release so no stale pointer survives */
	dst->from = NULL;
	dst_release(from);
}
421
422 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
423                            int how)
424 {
425         struct rt6_info *rt = (struct rt6_info *)dst;
426         struct inet6_dev *idev = rt->rt6i_idev;
427         struct net_device *loopback_dev =
428                 dev_net(dev)->loopback_dev;
429
430         if (idev && idev->dev != loopback_dev) {
431                 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
432                 if (loopback_idev) {
433                         rt->rt6i_idev = loopback_idev;
434                         in6_dev_put(idev);
435                 }
436         }
437 }
438
439 static bool __rt6_check_expired(const struct rt6_info *rt)
440 {
441         if (rt->rt6i_flags & RTF_EXPIRES)
442                 return time_after(jiffies, rt->dst.expires);
443         else
444                 return false;
445 }
446
447 static bool rt6_check_expired(const struct rt6_info *rt)
448 {
449         if (rt->rt6i_flags & RTF_EXPIRES) {
450                 if (time_after(jiffies, rt->dst.expires))
451                         return true;
452         } else if (rt->dst.from) {
453                 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
454                        rt6_check_expired((struct rt6_info *)rt->dst.from);
455         }
456         return false;
457 }
458
459 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
460                                              struct flowi6 *fl6, int oif,
461                                              int strict)
462 {
463         struct rt6_info *sibling, *next_sibling;
464         int route_choosen;
465
466         /* We might have already computed the hash for ICMPv6 errors. In such
467          * case it will always be non-zero. Otherwise now is the time to do it.
468          */
469         if (!fl6->mp_hash)
470                 fl6->mp_hash = rt6_multipath_hash(fl6, NULL);
471
472         route_choosen = fl6->mp_hash % (match->rt6i_nsiblings + 1);
473         /* Don't change the route, if route_choosen == 0
474          * (siblings does not include ourself)
475          */
476         if (route_choosen)
477                 list_for_each_entry_safe(sibling, next_sibling,
478                                 &match->rt6i_siblings, rt6i_siblings) {
479                         route_choosen--;
480                         if (route_choosen == 0) {
481                                 if (rt6_score_route(sibling, oif, strict) < 0)
482                                         break;
483                                 match = sibling;
484                                 break;
485                         }
486                 }
487         return match;
488 }
489
490 /*
491  *      Route lookup. Any table->tb6_lock is implied.
492  */
493
/* Walk the chain of routes starting at @rt and return the first one that
 * matches the requested output interface @oif (or, with no @oif, whose
 * device owns @saddr).  Loopback routes are remembered as a fallback.
 * Returns ip6_null_entry if a strict interface match was demanded and
 * nothing qualified; otherwise falls back to @rt itself.
 */
static inline struct rt6_info *rt6_device_match(struct net *net,
						    struct rt6_info *rt,
						    const struct in6_addr *saddr,
						    int oif,
						    int flags)
{
	struct rt6_info *local = NULL;
	struct rt6_info *sprt;

	/* nothing to constrain on — any route will do */
	if (!oif && ipv6_addr_any(saddr))
		goto out;

	for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
		struct net_device *dev = sprt->dst.dev;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;	/* exact device match */
			if (dev->flags & IFF_LOOPBACK) {
				if (!sprt->rt6i_idev ||
				    sprt->rt6i_idev->dev->ifindex != oif) {
					if (flags & RT6_LOOKUP_F_IFACE)
						continue;
					/* prefer a loopback route whose
					 * idev matches @oif
					 */
					if (local &&
					    local->rt6i_idev->dev->ifindex == oif)
						continue;
				}
				local = sprt;
			}
		} else {
			/* match by source address ownership instead */
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif) {
		if (local)
			return local;

		if (flags & RT6_LOOKUP_F_IFACE)
			return net->ipv6.ip6_null_entry;
	}
out:
	return rt;
}
540
541 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred-work context for sending a router reachability probe (NS) to
 * @target via @dev; @dev is held until rt6_probe_deferred() runs.
 */
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};
547
548 static void rt6_probe_deferred(struct work_struct *w)
549 {
550         struct in6_addr mcaddr;
551         struct __rt6_probe_work *work =
552                 container_of(w, struct __rt6_probe_work, work);
553
554         addrconf_addr_solict_mult(&work->target, &mcaddr);
555         ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
556         dev_put(work->dev);
557         kfree(work);
558 }
559
/* Router Reachability Probing: schedule a deferred NS towards the gateway
 * of @rt when its neighbour entry is missing or stale, rate-limited via
 * the neigh "probe once" mechanism and the idev's rtr_probe_interval.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct __rt6_probe_work *work;
	struct neighbour *neigh;
	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
		return;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;	/* already reachable, nothing to do */

		work = NULL;
		write_lock(&neigh->lock);
		/* re-check under the lock, and enforce the probe interval */
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated +
			       rt->rt6i_idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		/* no neighbour entry yet — probe unconditionally */
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = rt->rt6i_gateway;
		/* hold @dev for the deferred work; dropped in the callback */
		dev_hold(rt->dst.dev);
		work->dev = rt->dst.dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
606 #else
/* No router preference support: probing is a no-op. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
610 #endif
611
612 /*
613  * Default Router Selection (RFC 2461 6.3.6)
614  */
615 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
616 {
617         struct net_device *dev = rt->dst.dev;
618         if (!oif || dev->ifindex == oif)
619                 return 2;
620         if ((dev->flags & IFF_LOOPBACK) &&
621             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
622                 return 1;
623         return 0;
624 }
625
/* Evaluate the NUD state of @rt's next hop for route scoring.  Routes
 * without a gateway trivially succeed.  With router-preference support a
 * missing/unfailed neighbour still succeeds (it will be probed); without
 * it, a missing neighbour requests round-robin.
 */
static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
{
	struct neighbour *neigh;
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;

	if (rt->rt6i_flags & RTF_NONEXTHOP ||
	    !(rt->rt6i_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;	/* will be probed */
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
656
/* Compute a comparable score for @rt: device match in the low bits,
 * decoded router preference (if configured) above them.  Returns a
 * negative rt6_nud_state value when the route is unusable.
 */
static int rt6_score_route(struct rt6_info *rt, int oif,
			   int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;	/* wrong iface, strict mode */
#ifdef CONFIG_IPV6_ROUTER_PREF
	/* router preference occupies bits above the device-match score */
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}
675
/* Score @rt and return whichever of @rt / @match scores higher (tracking
 * the best score in *mpri).  Sets *do_rr when the best route so far asked
 * for round-robin.  Skips link-down devices (if configured) and expired
 * routes, and kicks off a probe when reachability is required.
 */
static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
				   int *mpri, struct rt6_info *match,
				   bool *do_rr)
{
	int m;
	bool match_do_rr = false;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *dev = rt->dst.dev;

	if (dev && !netif_carrier_ok(dev) &&
	    idev->cnf.ignore_routes_with_linkdown &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (rt6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}
713
/* Find the best route among those with the given @metric, starting the
 * round-robin scan at @rr_head and wrapping to the head of the leaf list.
 * If nothing with @metric matched, continue into the higher-metric
 * continuation (@cont) as a last resort.
 */
static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
				     struct rt6_info *rr_head,
				     u32 metric, int oif, int strict,
				     bool *do_rr)
{
	struct rt6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	/* first pass: rr_head to the end of this metric run */
	for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	/* second pass: wrap around from the leaf head up to rr_head */
	for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	/* nothing usable at @metric — fall through to worse metrics */
	for (rt = cont; rt; rt = rt->dst.rt6_next)
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
750
/* Default Router Selection: pick the best route in fib6_node @fn, using
 * fn->rr_ptr as the round-robin cursor among equal-metric routes and
 * advancing it when find_rr_leaf() requested round-robin.  Falls back to
 * the namespace null entry when nothing matched.
 */
static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
{
	struct rt6_info *match, *rt0;
	struct net *net;
	bool do_rr = false;

	rt0 = fn->rr_ptr;
	if (!rt0)
		fn->rr_ptr = rt0 = fn->leaf;

	match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct rt6_info *next = rt0->dst.rt6_next;

		/* no entries matched; do round-robin */
		if (!next || next->rt6i_metric != rt0->rt6i_metric)
			next = fn->leaf;

		if (next != rt0)
			fn->rr_ptr = next;
	}

	net = dev_net(rt0->dst.dev);
	return match ? match : net->ipv6.ip6_null_entry;
}
778
779 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
780 {
781         return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
782 }
783
784 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Handle a Route Information Option (RFC 4191) received in a Router
 * Advertisement on @dev from router @gwaddr: validate the option, then
 * add, refresh, or (when lifetime is zero) delete the corresponding
 * RTF_ROUTEINFO route.  Returns 0 on success or -EINVAL on a malformed
 * option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		/* prefixes longer than 64 bits need at least 2 option units */
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	/* a zero-length prefix refers to the default router entry */
	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	/* zero lifetime withdraws an existing route */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		/* refresh the preference on an existing route */
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			rt6_clean_expires(rt);
		else
			rt6_set_expires(rt, jiffies + HZ * lifetime);

		ip6_rt_put(rt);
	}
	return 0;
}
858 #endif
859
860 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
861                                         struct in6_addr *saddr)
862 {
863         struct fib6_node *pn;
864         while (1) {
865                 if (fn->fn_flags & RTN_TL_ROOT)
866                         return NULL;
867                 pn = fn->parent;
868                 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
869                         fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
870                 else
871                         fn = pn;
872                 if (fn->fn_flags & RTN_RTINFO)
873                         return fn;
874         }
875 }
876
/* Simple (non-policy-aware) table lookup used via fib6_rule_lookup().
 * Holds tb6_lock for reading; returns a route with its dst refcount
 * taken (dst_use), falling back to ip6_null_entry when nothing matches.
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
                                             struct fib6_table *table,
                                             struct flowi6 *fl6, int flags)
{
        struct rt6_info *rt, *rt_cache;
        struct fib6_node *fn;

        read_lock_bh(&table->tb6_lock);
        fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
        rt = fn->leaf;
        rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
        if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
                rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
        if (rt == net->ipv6.ip6_null_entry) {
                /* no usable route at this node: climb the tree and retry */
                fn = fib6_backtrack(fn, &fl6->saddr);
                if (fn)
                        goto restart;
        }
        /* Search through exception table */
        rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
        if (rt_cache)
                rt = rt_cache;

        /* take a reference and refresh lastuse before dropping the lock */
        dst_use(&rt->dst, jiffies);
        read_unlock_bh(&table->tb6_lock);

        trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);

        return rt;

}
909
/* Public lookup entry point: resolve @fl6 through the routing policy
 * rules, using ip6_pol_route_lookup() as the per-table handler.
 * Returns a held dst (possibly the null entry); caller must release it.
 */
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
                                    int flags)
{
        return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
916
917 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
918                             const struct in6_addr *saddr, int oif, int strict)
919 {
920         struct flowi6 fl6 = {
921                 .flowi6_oif = oif,
922                 .daddr = *daddr,
923         };
924         struct dst_entry *dst;
925         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
926
927         if (saddr) {
928                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
929                 flags |= RT6_LOOKUP_F_HAS_SADDR;
930         }
931
932         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
933         if (dst->error == 0)
934                 return (struct rt6_info *) dst;
935
936         dst_release(dst);
937
938         return NULL;
939 }
940 EXPORT_SYMBOL(rt6_lookup);
941
942 /* ip6_ins_rt is called with FREE table->tb6_lock.
943  * It takes new route entry, the addition fails by any reason the
944  * route is released.
945  * Caller must hold dst before calling it.
946  */
947
948 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
949                         struct mx6_config *mxc,
950                         struct netlink_ext_ack *extack)
951 {
952         int err;
953         struct fib6_table *table;
954
955         table = rt->rt6i_table;
956         write_lock_bh(&table->tb6_lock);
957         err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
958         write_unlock_bh(&table->tb6_lock);
959
960         return err;
961 }
962
963 int ip6_ins_rt(struct rt6_info *rt)
964 {
965         struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
966         struct mx6_config mxc = { .mx = NULL, };
967
968         /* Hold dst to account for the reference from the fib6 tree */
969         dst_hold(&rt->dst);
970         return __ip6_ins_rt(rt, &info, &mxc, NULL);
971 }
972
973 /* called with rcu_lock held */
974 static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
975 {
976         struct net_device *dev = rt->dst.dev;
977
978         if (rt->rt6i_flags & RTF_LOCAL) {
979                 /* for copies of local routes, dst->dev needs to be the
980                  * device if it is a master device, the master device if
981                  * device is enslaved, and the loopback as the default
982                  */
983                 if (netif_is_l3_slave(dev) &&
984                     !rt6_need_strict(&rt->rt6i_dst.addr))
985                         dev = l3mdev_master_dev_rcu(dev);
986                 else if (!netif_is_l3_master(dev))
987                         dev = dev_net(dev)->loopback_dev;
988                 /* last case is netif_is_l3_master(dev) is true in which
989                  * case we want dev returned to be dev
990                  */
991         }
992
993         return dev;
994 }
995
/* Create an RTF_CACHE host-route clone of @ort for (daddr, saddr).
 * If @ort is itself a cache/pcpu copy, clone from its parent route
 * (dst.from) instead.  Returns NULL on allocation failure; on success
 * the new route carries one dst reference for the caller.
 */
static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
                                           const struct in6_addr *daddr,
                                           const struct in6_addr *saddr)
{
        struct net_device *dev;
        struct rt6_info *rt;

        /*
         *      Clone the route.
         */

        if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
                ort = (struct rt6_info *)ort->dst.from;

        /* rcu protects the l3mdev/loopback device resolution */
        rcu_read_lock();
        dev = ip6_rt_get_dev_rcu(ort);
        rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
        rcu_read_unlock();
        if (!rt)
                return NULL;

        ip6_rt_copy_init(rt, ort);
        rt->rt6i_flags |= RTF_CACHE;
        rt->rt6i_metric = 0;
        rt->dst.flags |= DST_HOST;
        /* cache entries are /128 host routes for the exact destination */
        rt->rt6i_dst.addr = *daddr;
        rt->rt6i_dst.plen = 128;

        if (!rt6_is_gw_or_nonexthop(ort)) {
                if (ort->rt6i_dst.plen != 128 &&
                    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
                        rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
                if (rt->rt6i_src.plen && saddr) {
                        rt->rt6i_src.addr = *saddr;
                        rt->rt6i_src.plen = 128;
                }
#endif
        }

        return rt;
}
1038
1039 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
1040 {
1041         struct net_device *dev;
1042         struct rt6_info *pcpu_rt;
1043
1044         rcu_read_lock();
1045         dev = ip6_rt_get_dev_rcu(rt);
1046         pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
1047         rcu_read_unlock();
1048         if (!pcpu_rt)
1049                 return NULL;
1050         ip6_rt_copy_init(pcpu_rt, rt);
1051         pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1052         pcpu_rt->rt6i_flags |= RTF_PCPU;
1053         return pcpu_rt;
1054 }
1055
1056 /* It should be called with read_lock_bh(&tb6_lock) acquired */
1057 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1058 {
1059         struct rt6_info *pcpu_rt, **p;
1060
1061         p = this_cpu_ptr(rt->rt6i_pcpu);
1062         pcpu_rt = *p;
1063
1064         if (pcpu_rt) {
1065                 dst_hold(&pcpu_rt->dst);
1066                 rt6_dst_from_metrics_check(pcpu_rt);
1067         }
1068         return pcpu_rt;
1069 }
1070
/* Allocate and publish the per-cpu copy of @rt for this cpu.  Exactly
 * one publisher wins the cmpxchg; losers drop their copy and return the
 * winner's.  Always returns a route with a dst reference held for the
 * caller (ip6_null_entry on allocation failure).
 */
static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
{
        struct rt6_info *pcpu_rt, *prev, **p;

        pcpu_rt = ip6_rt_pcpu_alloc(rt);
        if (!pcpu_rt) {
                struct net *net = dev_net(rt->dst.dev);

                /* caller expects a held dst even on failure */
                dst_hold(&net->ipv6.ip6_null_entry->dst);
                return net->ipv6.ip6_null_entry;
        }

        /* hold for the caller before publishing the pointer */
        dst_hold(&pcpu_rt->dst);
        p = this_cpu_ptr(rt->rt6i_pcpu);
        prev = cmpxchg(p, NULL, pcpu_rt);
        if (prev) {
                /* If someone did it before us, return prev instead */
                /* release refcnt taken by ip6_rt_pcpu_alloc() */
                dst_release_immediate(&pcpu_rt->dst);
                /* release refcnt taken by above dst_hold() */
                dst_release_immediate(&pcpu_rt->dst);
                dst_hold(&prev->dst);
                pcpu_rt = prev;
        }

        rt6_dst_from_metrics_check(pcpu_rt);
        return pcpu_rt;
}
1099
1100 /* exception hash table implementation
1101  */
1102 static DEFINE_SPINLOCK(rt6_exception_lock);
1103
1104 /* Remove rt6_ex from hash table and free the memory
1105  * Caller must hold rt6_exception_lock
1106  */
1107 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1108                                  struct rt6_exception *rt6_ex)
1109 {
1110         if (!bucket || !rt6_ex)
1111                 return;
1112         rt6_ex->rt6i->rt6i_node = NULL;
1113         hlist_del_rcu(&rt6_ex->hlist);
1114         rt6_release(rt6_ex->rt6i);
1115         kfree_rcu(rt6_ex, rcu);
1116         WARN_ON_ONCE(!bucket->depth);
1117         bucket->depth--;
1118 }
1119
1120 /* Remove oldest rt6_ex in bucket and free the memory
1121  * Caller must hold rt6_exception_lock
1122  */
1123 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1124 {
1125         struct rt6_exception *rt6_ex, *oldest = NULL;
1126
1127         if (!bucket)
1128                 return;
1129
1130         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1131                 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1132                         oldest = rt6_ex;
1133         }
1134         rt6_remove_exception(bucket, oldest);
1135 }
1136
1137 static u32 rt6_exception_hash(const struct in6_addr *dst,
1138                               const struct in6_addr *src)
1139 {
1140         static u32 seed __read_mostly;
1141         u32 val;
1142
1143         net_get_random_once(&seed, sizeof(seed));
1144         val = jhash(dst, sizeof(*dst), seed);
1145
1146 #ifdef CONFIG_IPV6_SUBTREES
1147         if (src)
1148                 val = jhash(src, sizeof(*src), val);
1149 #endif
1150         return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1151 }
1152
1153 /* Helper function to find the cached rt in the hash table
1154  * and update bucket pointer to point to the bucket for this
1155  * (daddr, saddr) pair
1156  * Caller must hold rt6_exception_lock
1157  */
1158 static struct rt6_exception *
1159 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1160                               const struct in6_addr *daddr,
1161                               const struct in6_addr *saddr)
1162 {
1163         struct rt6_exception *rt6_ex;
1164         u32 hval;
1165
1166         if (!(*bucket) || !daddr)
1167                 return NULL;
1168
1169         hval = rt6_exception_hash(daddr, saddr);
1170         *bucket += hval;
1171
1172         hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1173                 struct rt6_info *rt6 = rt6_ex->rt6i;
1174                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1175
1176 #ifdef CONFIG_IPV6_SUBTREES
1177                 if (matched && saddr)
1178                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1179 #endif
1180                 if (matched)
1181                         return rt6_ex;
1182         }
1183         return NULL;
1184 }
1185
1186 /* Helper function to find the cached rt in the hash table
1187  * and update bucket pointer to point to the bucket for this
1188  * (daddr, saddr) pair
1189  * Caller must hold rcu_read_lock()
1190  */
1191 static struct rt6_exception *
1192 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1193                          const struct in6_addr *daddr,
1194                          const struct in6_addr *saddr)
1195 {
1196         struct rt6_exception *rt6_ex;
1197         u32 hval;
1198
1199         WARN_ON_ONCE(!rcu_read_lock_held());
1200
1201         if (!(*bucket) || !daddr)
1202                 return NULL;
1203
1204         hval = rt6_exception_hash(daddr, saddr);
1205         *bucket += hval;
1206
1207         hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1208                 struct rt6_info *rt6 = rt6_ex->rt6i;
1209                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1210
1211 #ifdef CONFIG_IPV6_SUBTREES
1212                 if (matched && saddr)
1213                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1214 #endif
1215                 if (matched)
1216                         return rt6_ex;
1217         }
1218         return NULL;
1219 }
1220
/* Insert cached route @nrt into the exception table of its parent
 * route @ort.  Takes a reference on @nrt on success and replaces any
 * existing entry for the same (dst[, src]) key.  Returns 0 or a
 * negative errno (-EINVAL when @ort is being flushed or @nrt's mtu is
 * not smaller than @ort's; -ENOMEM on allocation failure).
 */
static int rt6_insert_exception(struct rt6_info *nrt,
                                struct rt6_info *ort)
{
        struct rt6_exception_bucket *bucket;
        struct in6_addr *src_key = NULL;
        struct rt6_exception *rt6_ex;
        int err = 0;

        /* ort can't be a cache or pcpu route */
        if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
                ort = (struct rt6_info *)ort->dst.from;
        WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));

        spin_lock_bh(&rt6_exception_lock);

        /* rt6_flush_exceptions() marked ort dying; don't recreate the
         * bucket list for it
         */
        if (ort->exception_bucket_flushed) {
                err = -EINVAL;
                goto out;
        }

        bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
                                        lockdep_is_held(&rt6_exception_lock));
        if (!bucket) {
                bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
                                 GFP_ATOMIC);
                if (!bucket) {
                        err = -ENOMEM;
                        goto out;
                }
                rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
        }

#ifdef CONFIG_IPV6_SUBTREES
        /* rt6i_src.plen != 0 indicates ort is in subtree
         * and exception table is indexed by a hash of
         * both rt6i_dst and rt6i_src.
         * Otherwise, the exception table is indexed by
         * a hash of only rt6i_dst.
         */
        if (ort->rt6i_src.plen)
                src_key = &nrt->rt6i_src.addr;
#endif

        /* Update rt6i_prefsrc as it could be changed
         * in rt6_remove_prefsrc()
         */
        nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
        /* rt6_mtu_change() might lower mtu on ort.
         * Only insert this exception route if its mtu
         * is less than ort's mtu value.
         */
        if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
                err = -EINVAL;
                goto out;
        }

        /* replace a stale entry for the same key, if any */
        rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
                                               src_key);
        if (rt6_ex)
                rt6_remove_exception(bucket, rt6_ex);

        rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
        if (!rt6_ex) {
                err = -ENOMEM;
                goto out;
        }
        rt6_ex->rt6i = nrt;
        rt6_ex->stamp = jiffies;
        /* the exception table now holds a reference on nrt */
        atomic_inc(&nrt->rt6i_ref);
        nrt->rt6i_node = ort->rt6i_node;
        hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
        bucket->depth++;

        /* bound the chain length by evicting the oldest entry */
        if (bucket->depth > FIB6_MAX_DEPTH)
                rt6_exception_remove_oldest(bucket);

out:
        spin_unlock_bh(&rt6_exception_lock);

        /* Update fn->fn_sernum to invalidate all cached dst */
        if (!err)
                fib6_update_sernum(ort);

        return err;
}
1306
1307 void rt6_flush_exceptions(struct rt6_info *rt)
1308 {
1309         struct rt6_exception_bucket *bucket;
1310         struct rt6_exception *rt6_ex;
1311         struct hlist_node *tmp;
1312         int i;
1313
1314         spin_lock_bh(&rt6_exception_lock);
1315         /* Prevent rt6_insert_exception() to recreate the bucket list */
1316         rt->exception_bucket_flushed = 1;
1317
1318         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1319                                     lockdep_is_held(&rt6_exception_lock));
1320         if (!bucket)
1321                 goto out;
1322
1323         for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1324                 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1325                         rt6_remove_exception(bucket, rt6_ex);
1326                 WARN_ON_ONCE(bucket->depth);
1327                 bucket++;
1328         }
1329
1330 out:
1331         spin_unlock_bh(&rt6_exception_lock);
1332 }
1333
1334 /* Find cached rt in the hash table inside passed in rt
1335  * Caller has to hold rcu_read_lock()
1336  */
1337 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
1338                                            struct in6_addr *daddr,
1339                                            struct in6_addr *saddr)
1340 {
1341         struct rt6_exception_bucket *bucket;
1342         struct in6_addr *src_key = NULL;
1343         struct rt6_exception *rt6_ex;
1344         struct rt6_info *res = NULL;
1345
1346         bucket = rcu_dereference(rt->rt6i_exception_bucket);
1347
1348 #ifdef CONFIG_IPV6_SUBTREES
1349         /* rt6i_src.plen != 0 indicates rt is in subtree
1350          * and exception table is indexed by a hash of
1351          * both rt6i_dst and rt6i_src.
1352          * Otherwise, the exception table is indexed by
1353          * a hash of only rt6i_dst.
1354          */
1355         if (rt->rt6i_src.plen)
1356                 src_key = saddr;
1357 #endif
1358         rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1359
1360         if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1361                 res = rt6_ex->rt6i;
1362
1363         return res;
1364 }
1365
1366 /* Remove the passed in cached rt from the hash table that contains it */
1367 int rt6_remove_exception_rt(struct rt6_info *rt)
1368 {
1369         struct rt6_info *from = (struct rt6_info *)rt->dst.from;
1370         struct rt6_exception_bucket *bucket;
1371         struct in6_addr *src_key = NULL;
1372         struct rt6_exception *rt6_ex;
1373         int err;
1374
1375         if (!from ||
1376             !(rt->rt6i_flags | RTF_CACHE))
1377                 return -EINVAL;
1378
1379         if (!rcu_access_pointer(from->rt6i_exception_bucket))
1380                 return -ENOENT;
1381
1382         spin_lock_bh(&rt6_exception_lock);
1383         bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1384                                     lockdep_is_held(&rt6_exception_lock));
1385 #ifdef CONFIG_IPV6_SUBTREES
1386         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1387          * and exception table is indexed by a hash of
1388          * both rt6i_dst and rt6i_src.
1389          * Otherwise, the exception table is indexed by
1390          * a hash of only rt6i_dst.
1391          */
1392         if (from->rt6i_src.plen)
1393                 src_key = &rt->rt6i_src.addr;
1394 #endif
1395         rt6_ex = __rt6_find_exception_spinlock(&bucket,
1396                                                &rt->rt6i_dst.addr,
1397                                                src_key);
1398         if (rt6_ex) {
1399                 rt6_remove_exception(bucket, rt6_ex);
1400                 err = 0;
1401         } else {
1402                 err = -ENOENT;
1403         }
1404
1405         spin_unlock_bh(&rt6_exception_lock);
1406         return err;
1407 }
1408
1409 /* Find rt6_ex which contains the passed in rt cache and
1410  * refresh its stamp
1411  */
1412 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1413 {
1414         struct rt6_info *from = (struct rt6_info *)rt->dst.from;
1415         struct rt6_exception_bucket *bucket;
1416         struct in6_addr *src_key = NULL;
1417         struct rt6_exception *rt6_ex;
1418
1419         if (!from ||
1420             !(rt->rt6i_flags | RTF_CACHE))
1421                 return;
1422
1423         rcu_read_lock();
1424         bucket = rcu_dereference(from->rt6i_exception_bucket);
1425
1426 #ifdef CONFIG_IPV6_SUBTREES
1427         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1428          * and exception table is indexed by a hash of
1429          * both rt6i_dst and rt6i_src.
1430          * Otherwise, the exception table is indexed by
1431          * a hash of only rt6i_dst.
1432          */
1433         if (from->rt6i_src.plen)
1434                 src_key = &rt->rt6i_src.addr;
1435 #endif
1436         rt6_ex = __rt6_find_exception_rcu(&bucket,
1437                                           &rt->rt6i_dst.addr,
1438                                           src_key);
1439         if (rt6_ex)
1440                 rt6_ex->stamp = jiffies;
1441
1442         rcu_read_unlock();
1443 }
1444
1445 static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
1446 {
1447         struct rt6_exception_bucket *bucket;
1448         struct rt6_exception *rt6_ex;
1449         int i;
1450
1451         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1452                                         lockdep_is_held(&rt6_exception_lock));
1453
1454         if (bucket) {
1455                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1456                         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1457                                 rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
1458                         }
1459                         bucket++;
1460                 }
1461         }
1462 }
1463
1464 static void rt6_exceptions_update_pmtu(struct rt6_info *rt, int mtu)
1465 {
1466         struct rt6_exception_bucket *bucket;
1467         struct rt6_exception *rt6_ex;
1468         int i;
1469
1470         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1471                                         lockdep_is_held(&rt6_exception_lock));
1472
1473         if (bucket) {
1474                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1475                         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1476                                 struct rt6_info *entry = rt6_ex->rt6i;
1477                                 /* For RTF_CACHE with rt6i_pmtu == 0
1478                                  * (i.e. a redirected route),
1479                                  * the metrics of its rt->dst.from has already
1480                                  * been updated.
1481                                  */
1482                                 if (entry->rt6i_pmtu && entry->rt6i_pmtu > mtu)
1483                                         entry->rt6i_pmtu = mtu;
1484                         }
1485                         bucket++;
1486                 }
1487         }
1488 }
1489
1490 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
1491
1492 static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
1493                                         struct in6_addr *gateway)
1494 {
1495         struct rt6_exception_bucket *bucket;
1496         struct rt6_exception *rt6_ex;
1497         struct hlist_node *tmp;
1498         int i;
1499
1500         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1501                 return;
1502
1503         spin_lock_bh(&rt6_exception_lock);
1504         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1505                                      lockdep_is_held(&rt6_exception_lock));
1506
1507         if (bucket) {
1508                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1509                         hlist_for_each_entry_safe(rt6_ex, tmp,
1510                                                   &bucket->chain, hlist) {
1511                                 struct rt6_info *entry = rt6_ex->rt6i;
1512
1513                                 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1514                                     RTF_CACHE_GATEWAY &&
1515                                     ipv6_addr_equal(gateway,
1516                                                     &entry->rt6i_gateway)) {
1517                                         rt6_remove_exception(bucket, rt6_ex);
1518                                 }
1519                         }
1520                         bucket++;
1521                 }
1522         }
1523
1524         spin_unlock_bh(&rt6_exception_lock);
1525 }
1526
/* Decide the fate of one cached exception route during fib6 gc:
 * remove it when it is unreferenced and idle past the gc timeout,
 * remove it when its gateway neighbour is not flagged as a router,
 * otherwise bump gc_args->more so gc knows work remains.
 */
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
                                      struct rt6_exception *rt6_ex,
                                      struct fib6_gc_args *gc_args,
                                      unsigned long now)
{
        struct rt6_info *rt = rt6_ex->rt6i;

        /* __refcnt == 1 means only the exception table still holds it */
        if (atomic_read(&rt->dst.__refcnt) == 1 &&
            time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
                RT6_TRACE("aging clone %p\n", rt);
                rt6_remove_exception(bucket, rt6_ex);
                return;
        } else if (rt->rt6i_flags & RTF_GATEWAY) {
                struct neighbour *neigh;
                __u8 neigh_flags = 0;

                neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway);
                if (neigh) {
                        neigh_flags = neigh->flags;
                        neigh_release(neigh);
                }
                /* a missing neighbour also fails the NTF_ROUTER test */
                if (!(neigh_flags & NTF_ROUTER)) {
                        RT6_TRACE("purging route %p via non-router but gateway\n",
                                  rt);
                        rt6_remove_exception(bucket, rt6_ex);
                        return;
                }
        }
        gc_args->more++;
}
1557
1558 void rt6_age_exceptions(struct rt6_info *rt,
1559                         struct fib6_gc_args *gc_args,
1560                         unsigned long now)
1561 {
1562         struct rt6_exception_bucket *bucket;
1563         struct rt6_exception *rt6_ex;
1564         struct hlist_node *tmp;
1565         int i;
1566
1567         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1568                 return;
1569
1570         spin_lock_bh(&rt6_exception_lock);
1571         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1572                                     lockdep_is_held(&rt6_exception_lock));
1573
1574         if (bucket) {
1575                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1576                         hlist_for_each_entry_safe(rt6_ex, tmp,
1577                                                   &bucket->chain, hlist) {
1578                                 rt6_age_examine_exception(bucket, rt6_ex,
1579                                                           gc_args, now);
1580                         }
1581                         bucket++;
1582                 }
1583         }
1584         spin_unlock_bh(&rt6_exception_lock);
1585 }
1586
/* Core policy-table route resolver.  Selects a route from @table for
 * @fl6, preferring cached exception entries, and returns it with a dst
 * reference held.  Depending on the match it returns the route itself,
 * an uncached RTF_CACHE clone (KNOWN_NH case), or a per-cpu copy.
 */
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
                               int oif, struct flowi6 *fl6, int flags)
{
        struct fib6_node *fn, *saved_fn;
        struct rt6_info *rt, *rt_cache;
        int strict = 0;

        strict |= flags & RT6_LOOKUP_F_IFACE;
        strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
        /* non-forwarding hosts first restrict to reachable routers */
        if (net->ipv6.devconf_all->forwarding == 0)
                strict |= RT6_LOOKUP_F_REACHABLE;

        read_lock_bh(&table->tb6_lock);

        fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
        saved_fn = fn;

        if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
                oif = 0;

redo_rt6_select:
        rt = rt6_select(fn, oif, strict);
        if (rt->rt6i_nsiblings)
                rt = rt6_multipath_select(rt, fl6, oif, strict);
        if (rt == net->ipv6.ip6_null_entry) {
                /* no match here: backtrack, then retry without the
                 * reachability restriction as a last resort
                 */
                fn = fib6_backtrack(fn, &fl6->saddr);
                if (fn)
                        goto redo_rt6_select;
                else if (strict & RT6_LOOKUP_F_REACHABLE) {
                        /* also consider unreachable route */
                        strict &= ~RT6_LOOKUP_F_REACHABLE;
                        fn = saved_fn;
                        goto redo_rt6_select;
                }
        }

        /*Search through exception table */
        rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
        if (rt_cache)
                rt = rt_cache;

        if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
                dst_use(&rt->dst, jiffies);
                read_unlock_bh(&table->tb6_lock);

                rt6_dst_from_metrics_check(rt);

                trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
                return rt;
        } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
                            !(rt->rt6i_flags & RTF_GATEWAY))) {
                /* Create a RTF_CACHE clone which will not be
                 * owned by the fib6 tree.  It is for the special case where
                 * the daddr in the skb during the neighbor look-up is different
                 * from the fl6->daddr used to look-up route here.
                 */

                struct rt6_info *uncached_rt;

                dst_use(&rt->dst, jiffies);
                read_unlock_bh(&table->tb6_lock);

                uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
                dst_release(&rt->dst);

                if (uncached_rt) {
                        /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
                         * No need for another dst_hold()
                         */
                        rt6_uncached_list_add(uncached_rt);
                } else {
                        uncached_rt = net->ipv6.ip6_null_entry;
                        dst_hold(&uncached_rt->dst);
                }

                trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
                return uncached_rt;

        } else {
                /* Get a percpu copy */

                struct rt6_info *pcpu_rt;

                rt->dst.lastuse = jiffies;
                rt->dst.__use++;
                pcpu_rt = rt6_get_pcpu_route(rt);

                if (pcpu_rt) {
                        read_unlock_bh(&table->tb6_lock);
                } else {
                        /* atomic_inc_not_zero() is needed when using rcu */
                        if (atomic_inc_not_zero(&rt->rt6i_ref)) {
                                /* We have to do the read_unlock first
                                 * because rt6_make_pcpu_route() may trigger
                                 * ip6_dst_gc() which will take the write_lock.
                                 *
                                 * No dst_hold() on rt is needed because grabbing
                                 * rt->rt6i_ref makes sure rt can't be released.
                                 */
                                read_unlock_bh(&table->tb6_lock);
                                pcpu_rt = rt6_make_pcpu_route(rt);
                                rt6_release(rt);
                        } else {
                                /* rt is already removed from tree */
                                read_unlock_bh(&table->tb6_lock);
                                pcpu_rt = net->ipv6.ip6_null_entry;
                                dst_hold(&pcpu_rt->dst);
                        }
                }

                trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
                return pcpu_rt;
        }
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
1702
/* Table-lookup callback for the input path: resolve @fl6 in @table,
 * keyed on the interface the packet arrived on (flowi6_iif).
 */
static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
					    struct flowi6 *fl6, int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
}
1708
1709 struct dst_entry *ip6_route_input_lookup(struct net *net,
1710                                          struct net_device *dev,
1711                                          struct flowi6 *fl6, int flags)
1712 {
1713         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1714                 flags |= RT6_LOOKUP_F_IFACE;
1715
1716         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1717 }
1718 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1719
1720 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1721                                   struct flow_keys *keys)
1722 {
1723         const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1724         const struct ipv6hdr *key_iph = outer_iph;
1725         const struct ipv6hdr *inner_iph;
1726         const struct icmp6hdr *icmph;
1727         struct ipv6hdr _inner_iph;
1728
1729         if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1730                 goto out;
1731
1732         icmph = icmp6_hdr(skb);
1733         if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1734             icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1735             icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1736             icmph->icmp6_type != ICMPV6_PARAMPROB)
1737                 goto out;
1738
1739         inner_iph = skb_header_pointer(skb,
1740                                        skb_transport_offset(skb) + sizeof(*icmph),
1741                                        sizeof(_inner_iph), &_inner_iph);
1742         if (!inner_iph)
1743                 goto out;
1744
1745         key_iph = inner_iph;
1746 out:
1747         memset(keys, 0, sizeof(*keys));
1748         keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1749         keys->addrs.v6addrs.src = key_iph->saddr;
1750         keys->addrs.v6addrs.dst = key_iph->daddr;
1751         keys->tags.flow_label = ip6_flowinfo(key_iph);
1752         keys->basic.ip_proto = key_iph->nexthdr;
1753 }
1754
1755 /* if skb is set it will be used and fl6 can be NULL */
1756 u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
1757 {
1758         struct flow_keys hash_keys;
1759
1760         if (skb) {
1761                 ip6_multipath_l3_keys(skb, &hash_keys);
1762                 return flow_hash_from_keys(&hash_keys);
1763         }
1764
1765         return get_hash_from_flowi6(fl6);
1766 }
1767
/* Route an incoming skb and attach the lookup result as its dst.
 *
 * Builds a flow key from the IPv6 header and skb metadata, honours
 * RX-side tunnel metadata, computes a multipath hash for ICMPv6 (see
 * ip6_multipath_l3_keys()), then replaces any dst already on the skb.
 */
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};

	/* Only RX-mode tunnel metadata influences the route lookup. */
	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
	/* Hash ICMPv6 on the inner packet's keys so errors follow the
	 * same multipath nexthop as the flow that caused them.
	 */
	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
	skb_dst_drop(skb);
	skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
}
1791
/* Table-lookup callback for the output path: resolve @fl6 in @table,
 * keyed on the requested outgoing interface (flowi6_oif).
 */
static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
					     struct flowi6 *fl6, int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
}
1797
/* Resolve an output route for @fl6 on behalf of @sk (may be NULL).
 *
 * Strict-scope destinations are first offered to the l3mdev (VRF)
 * link-scope lookup.  flowi6_iif is forced to loopback to mark the
 * flow as locally originated before the FIB rules are consulted.
 */
struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	if (rt6_need_strict(&fl6->daddr)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	/* Pin the lookup to the requested device when the socket is
	 * bound, the destination needs a strict device match, or an
	 * oif was given without a source address.
	 */
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		/* No source given: use the socket's source-address
		 * selection preferences.
		 */
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1826
/* Clone @dst_orig into a self-contained "blackhole" route: it keeps
 * the original's metrics, gateway and prefix but discards all traffic
 * in both directions.  Consumes the caller's reference on @dst_orig.
 * Returns the new dst, or ERR_PTR(-ENOMEM) on allocation failure.
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_NONE, 0);
	if (rt) {
		rt6_info_init(rt);

		new = &rt->dst;
		new->__use = 1;
		/* Drop packets in both directions. */
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		/* The clone is not a per-cpu copy even if the original was. */
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
		rt->rt6i_metric = 0;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}
1859
1860 /*
1861  *      Destination cache support functions
1862  */
1863
/* If @rt inherits metrics from a parent ('from') route and the
 * parent's metrics array has since been replaced, re-point rt's
 * metrics at the parent's current array.
 */
static void rt6_dst_from_metrics_check(struct rt6_info *rt)
{
	if (rt->dst.from &&
	    dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
		dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
}
1870
1871 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1872 {
1873         u32 rt_cookie = 0;
1874
1875         if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
1876                 return NULL;
1877
1878         if (rt6_check_expired(rt))
1879                 return NULL;
1880
1881         return &rt->dst;
1882 }
1883
1884 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1885 {
1886         if (!__rt6_check_expired(rt) &&
1887             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1888             rt6_check((struct rt6_info *)(rt->dst.from), cookie))
1889                 return &rt->dst;
1890         else
1891                 return NULL;
1892 }
1893
/* dst_ops->check hook: decide whether a cached dst is still valid.
 * Per-cpu copies and uncached clones that have a parent route are
 * validated against that parent; everything else directly.
 */
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rt6_info *rt;

	rt = (struct rt6_info *) dst;

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	/* Re-inherit metrics if the parent's were replaced. */
	rt6_dst_from_metrics_check(rt);

	if (rt->rt6i_flags & RTF_PCPU ||
	    (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from))
		return rt6_dst_from_check(rt, cookie);
	else
		return rt6_check(rt, cookie);
}
1913
/* dst_ops->negative_advice hook: called when the caller suspects the
 * route has gone bad.  Expired cached entries are deleted from the
 * tree; non-cached routes simply have the caller's reference dropped
 * so the next lookup fetches a fresh one.  Returns the dst to keep
 * using, or NULL when the caller should re-resolve.
 */
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			if (rt6_check_expired(rt)) {
				ip6_del_rt(rt);
				dst = NULL;
			}
		} else {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}
1931
/* dst_ops->link_failure hook: the nexthop is unreachable.  Report
 * host-unreachable to the sender, then either delete the cached route
 * or, for default routes, poison the fib node's serial number so the
 * cookie check in rt6_check() fails and cached dsts get re-validated.
 */
static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			/* Only delete if we can still take a reference;
			 * otherwise someone else is already tearing it down.
			 */
			if (dst_hold_safe(&rt->dst))
				ip6_del_rt(rt);
		} else {
			struct fib6_node *fn;

			/* rt6i_node is RCU-protected. */
			rcu_read_lock();
			fn = rcu_dereference(rt->rt6i_node);
			if (fn && (rt->rt6i_flags & RTF_DEFAULT))
				fn->fn_sernum = -1;
			rcu_read_unlock();
		}
	}
}
1954
/* Record a new path MTU on @rt, mark it modified, and (re)arm its
 * expiry with the ip6_rt_mtu_expires sysctl interval.
 */
static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
	struct net *net = dev_net(rt->dst.dev);

	rt->rt6i_flags |= RTF_MODIFIED;
	rt->rt6i_pmtu = mtu;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}
1963
1964 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
1965 {
1966         return !(rt->rt6i_flags & RTF_CACHE) &&
1967                 (rt->rt6i_flags & RTF_PCPU ||
1968                  rcu_access_pointer(rt->rt6i_node));
1969 }
1970
/* Core PMTU update.  Depending on the route's type, either record the
 * new MTU on the route itself or create a cached exception clone that
 * carries it.  Addresses come from @iph when available, else from @sk.
 */
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	if (rt6->rt6i_flags & RTF_LOCAL)
		return;

	/* A locked MTU metric must not be overridden by PMTU discovery. */
	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	/* Prefer addresses from the packet; fall back to the socket. */
	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}
	dst_confirm_neigh(dst, daddr);
	/* Never go below the IPv6 minimum MTU, and ignore increases. */
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
	} else if (daddr) {
		struct rt6_info *nrt6;

		/* Clone the route and store the MTU on the clone as a
		 * per-destination cached exception.
		 */
		nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);
			if (rt6_insert_exception(nrt6, rt6))
				dst_release_immediate(&nrt6->dst);
		}
	}
}
2014
/* dst_ops->update_pmtu hook: forward to __ip6_rt_update_pmtu(),
 * passing the skb's IPv6 header when a packet is available.
 */
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu)
{
	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
}
2020
2021 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2022                      int oif, u32 mark, kuid_t uid)
2023 {
2024         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2025         struct dst_entry *dst;
2026         struct flowi6 fl6;
2027
2028         memset(&fl6, 0, sizeof(fl6));
2029         fl6.flowi6_oif = oif;
2030         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2031         fl6.daddr = iph->daddr;
2032         fl6.saddr = iph->saddr;
2033         fl6.flowlabel = ip6_flowinfo(iph);
2034         fl6.flowi6_uid = uid;
2035
2036         dst = ip6_route_output(net, NULL, &fl6);
2037         if (!dst->error)
2038                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2039         dst_release(dst);
2040 }
2041 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2042
/* Update PMTU for @sk's flow and, if the socket's cached dst fails
 * its validity check afterwards, refresh the datagram route under the
 * socket lock (skipping v4-mapped destinations).
 */
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	struct dst_entry *dst;

	ip6_update_pmtu(skb, sock_net(sk), mtu,
			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);

	/* Nothing more to do if the cached dst still validates. */
	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2061
/* Handle redirects */

/* A flowi6 extended with the redirecting router's address, so that
 * __ip6_route_redirect() can check the redirect really came from the
 * gateway currently in use for the destination.
 */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;
};
2067
/* Table-lookup callback used by ip6_route_redirect().
 *
 * Finds the route currently serving fl6->daddr and accepts the
 * redirect only if it was sent by that route's gateway, consulting
 * cached exception routes whose gateway may differ from the parent's.
 * Returns the matched route (or the null entry) with a reference
 * held.
 */
static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *rt, *rt_cache;
	struct fib6_node *fn;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	read_lock_bh(&table->tb6_lock);
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
		if (rt6_check_expired(rt))
			continue;
		if (rt->dst.error)
			break;
		if (!(rt->rt6i_flags & RTF_GATEWAY))
			continue;
		if (fl6->flowi6_oif != rt->dst.dev->ifindex)
			continue;
		/* rt_cache's gateway might be different from its 'parent'
		 * in the case of an ip redirect.
		 * So we keep searching in the exception table if the gateway
		 * is different.
		 */
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
			rt_cache = rt6_find_cached_rt(rt,
						      &fl6->daddr,
						      &fl6->saddr);
			if (rt_cache &&
			    ipv6_addr_equal(&rdfl->gateway,
					    &rt_cache->rt6i_gateway)) {
				rt = rt_cache;
				break;
			}
			continue;
		}
		break;
	}

	if (!rt)
		rt = net->ipv6.ip6_null_entry;
	else if (rt->dst.error) {
		rt = net->ipv6.ip6_null_entry;
		goto out;
	}

	/* No match in this subtree: back up the tree and retry. */
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

out:
	dst_hold(&rt->dst);

	read_unlock_bh(&table->tb6_lock);

	trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
	return rt;
};
2140
2141 static struct dst_entry *ip6_route_redirect(struct net *net,
2142                                         const struct flowi6 *fl6,
2143                                         const struct in6_addr *gateway)
2144 {
2145         int flags = RT6_LOOKUP_F_HAS_SADDR;
2146         struct ip6rd_flowi rdfl;
2147
2148         rdfl.fl6 = *fl6;
2149         rdfl.gateway = *gateway;
2150
2151         return fib6_rule_lookup(net, &rdfl.fl6,
2152                                 flags, __ip6_route_redirect);
2153 }
2154
2155 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2156                   kuid_t uid)
2157 {
2158         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2159         struct dst_entry *dst;
2160         struct flowi6 fl6;
2161
2162         memset(&fl6, 0, sizeof(fl6));
2163         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2164         fl6.flowi6_oif = oif;
2165         fl6.flowi6_mark = mark;
2166         fl6.daddr = iph->daddr;
2167         fl6.saddr = iph->saddr;
2168         fl6.flowlabel = ip6_flowinfo(iph);
2169         fl6.flowi6_uid = uid;
2170
2171         dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
2172         rt6_do_redirect(dst, NULL, skb);
2173         dst_release(dst);
2174 }
2175 EXPORT_SYMBOL_GPL(ip6_redirect);
2176
2177 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2178                             u32 mark)
2179 {
2180         const struct ipv6hdr *iph = ipv6_hdr(skb);
2181         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2182         struct dst_entry *dst;
2183         struct flowi6 fl6;
2184
2185         memset(&fl6, 0, sizeof(fl6));
2186         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2187         fl6.flowi6_oif = oif;
2188         fl6.flowi6_mark = mark;
2189         fl6.daddr = msg->dest;
2190         fl6.saddr = iph->daddr;
2191         fl6.flowi6_uid = sock_net_uid(net, NULL);
2192
2193         dst = ip6_route_redirect(net, &fl6, &iph->saddr);
2194         rt6_do_redirect(dst, NULL, skb);
2195         dst_release(dst);
2196 }
2197
/* Socket convenience wrapper: process a redirect using the socket's
 * bound device, mark, and uid.
 */
void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
		     sk->sk_uid);
}
EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2204
2205 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2206 {
2207         struct net_device *dev = dst->dev;
2208         unsigned int mtu = dst_mtu(dst);
2209         struct net *net = dev_net(dev);
2210
2211         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2212
2213         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2214                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2215
2216         /*
2217          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2218          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2219          * IPV6_MAXPLEN is also valid and means: "any MSS,
2220          * rely only on pmtu discovery"
2221          */
2222         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2223                 mtu = IPV6_MAXPLEN;
2224         return mtu;
2225 }
2226
/* dst_ops->mtu hook: effective MTU of the route.
 * Precedence: cached PMTU, then the RTAX_MTU metric, then the
 * device's mtu6 (falling back to IPV6_MIN_MTU); the result is capped
 * at IP6_MAX_MTU and reduced by any lwtunnel encapsulation headroom.
 */
static unsigned int ip6_mtu(const struct dst_entry *dst)
{
	const struct rt6_info *rt = (const struct rt6_info *)dst;
	unsigned int mtu = rt->rt6i_pmtu;
	struct inet6_dev *idev;

	if (mtu)
		goto out;

	mtu = dst_metric_raw(dst, RTAX_MTU);
	if (mtu)
		goto out;

	mtu = IPV6_MIN_MTU;

	/* idev is RCU-protected. */
	rcu_read_lock();
	idev = __in6_dev_get(dst->dev);
	if (idev)
		mtu = idev->cnf.mtu6;
	rcu_read_unlock();

out:
	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}
2253
/* Allocate a transient host route for sending an ICMPv6/NDISC packet
 * out @dev toward fl6->daddr, bypassing the FIB.  The result is run
 * through xfrm_lookup().  Returns an ERR_PTR() on failure; on success
 * the route owns a reference on @dev's inet6_dev.
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		in6_dev_put(idev);
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	rt->dst.flags |= DST_HOST;
	rt->dst.output  = ip6_output;
	rt->rt6i_gateway  = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev     = idev;	/* ownership of idev ref moves to rt */
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Add this dst into uncached_list so that rt6_ifdown() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}
2290
/* dst_ops->gc hook: garbage-collect IPv6 routes.
 * Skips the scan while both under the minimum GC interval and below
 * the max-size threshold; otherwise runs fib6_run_gc() with a
 * progressively larger expiry budget, which decays afterwards per
 * the gc_elasticity sysctl.  Returns nonzero while the table is
 * still over ip6_rt_max_size.
 */
static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
	return entries > rt_max_size;
}
2315
/* Parse the RTAX_* netlink metric attributes in @cfg->fc_mx into a
 * freshly allocated u32 array stored in @mxc (caller owns mxc->mx).
 * RTAX_CC_ALGO names are translated to keys, RTAX_HOPLIMIT is clamped
 * to 255, and unknown or invalid attributes fail the whole parse.
 * Returns 0 on success, -ENOMEM on allocation failure, -EINVAL on a
 * bad attribute.
 */
static int ip6_convert_metrics(struct mx6_config *mxc,
			       const struct fib6_config *cfg)
{
	bool ecn_ca = false;
	struct nlattr *nla;
	int remaining;
	u32 *mp;

	if (!cfg->fc_mx)
		return 0;

	mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
	if (unlikely(!mp))
		return -ENOMEM;

	nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
		int type = nla_type(nla);
		u32 val;

		if (!type)
			continue;
		if (unlikely(type > RTAX_MAX))
			goto err;

		if (type == RTAX_CC_ALGO) {
			char tmp[TCP_CA_NAME_MAX];

			/* Congestion-control algorithm is given by name;
			 * convert it to its key.
			 */
			nla_strlcpy(tmp, nla, sizeof(tmp));
			val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
			if (val == TCP_CA_UNSPEC)
				goto err;
		} else {
			val = nla_get_u32(nla);
		}
		if (type == RTAX_HOPLIMIT && val > 255)
			val = 255;
		if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
			goto err;

		/* Metrics array is 0-based; attribute types are 1-based. */
		mp[type - 1] = val;
		__set_bit(type - 1, mxc->mx_valid);
	}

	if (ecn_ca) {
		__set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
		mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
	}

	mxc->mx = mp;
	return 0;
 err:
	kfree(mp);
	return -EINVAL;
}
2370
/* Resolve the nexthop gateway @gw_addr within cfg->fc_table only.
 * Returns NULL when the table does not exist or when the lookup hit
 * the null entry, so the caller can fall back to a full lookup.
 */
static struct rt6_info *ip6_nh_lookup_table(struct net *net,
					    struct fib6_config *cfg,
					    const struct in6_addr *gw_addr)
{
	struct flowi6 fl6 = {
		.flowi6_oif = cfg->fc_ifindex,
		.daddr = *gw_addr,
		.saddr = cfg->fc_prefsrc,
	};
	struct fib6_table *table;
	struct rt6_info *rt;
	int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE;

	table = fib6_get_table(net, cfg->fc_table);
	if (!table)
		return NULL;

	if (!ipv6_addr_any(&cfg->fc_prefsrc))
		flags |= RT6_LOOKUP_F_HAS_SADDR;

	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);

	/* if table lookup failed, fall back to full lookup */
	if (rt == net->ipv6.ip6_null_entry) {
		ip6_rt_put(rt);
		rt = NULL;
	}

	return rt;
}
2401
2402 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
2403                                               struct netlink_ext_ack *extack)
2404 {
2405         struct net *net = cfg->fc_nlinfo.nl_net;
2406         struct rt6_info *rt = NULL;
2407         struct net_device *dev = NULL;
2408         struct inet6_dev *idev = NULL;
2409         struct fib6_table *table;
2410         int addr_type;
2411         int err = -EINVAL;
2412
2413         /* RTF_PCPU is an internal flag; can not be set by userspace */
2414         if (cfg->fc_flags & RTF_PCPU) {
2415                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2416                 goto out;
2417         }
2418
2419         if (cfg->fc_dst_len > 128) {
2420                 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2421                 goto out;
2422         }
2423         if (cfg->fc_src_len > 128) {
2424                 NL_SET_ERR_MSG(extack, "Invalid source address length");
2425                 goto out;
2426         }
2427 #ifndef CONFIG_IPV6_SUBTREES
2428         if (cfg->fc_src_len) {
2429                 NL_SET_ERR_MSG(extack,
2430                                "Specifying source address requires IPV6_SUBTREES to be enabled");
2431                 goto out;
2432         }
2433 #endif
2434         if (cfg->fc_ifindex) {
2435                 err = -ENODEV;
2436                 dev = dev_get_by_index(net, cfg->fc_ifindex);
2437                 if (!dev)
2438                         goto out;
2439                 idev = in6_dev_get(dev);
2440                 if (!idev)
2441                         goto out;
2442         }
2443
2444         if (cfg->fc_metric == 0)
2445                 cfg->fc_metric = IP6_RT_PRIO_USER;
2446
2447         err = -ENOBUFS;
2448         if (cfg->fc_nlinfo.nlh &&
2449             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2450                 table = fib6_get_table(net, cfg->fc_table);
2451                 if (!table) {
2452                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2453                         table = fib6_new_table(net, cfg->fc_table);
2454                 }
2455         } else {
2456                 table = fib6_new_table(net, cfg->fc_table);
2457         }
2458
2459         if (!table)
2460                 goto out;
2461
2462         rt = ip6_dst_alloc(net, NULL,
2463                            (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
2464
2465         if (!rt) {
2466                 err = -ENOMEM;
2467                 goto out;
2468         }
2469
2470         if (cfg->fc_flags & RTF_EXPIRES)
2471                 rt6_set_expires(rt, jiffies +
2472                                 clock_t_to_jiffies(cfg->fc_expires));
2473         else
2474                 rt6_clean_expires(rt);
2475
2476         if (cfg->fc_protocol == RTPROT_UNSPEC)
2477                 cfg->fc_protocol = RTPROT_BOOT;
2478         rt->rt6i_protocol = cfg->fc_protocol;
2479
2480         addr_type = ipv6_addr_type(&cfg->fc_dst);
2481
2482         if (addr_type & IPV6_ADDR_MULTICAST)
2483                 rt->dst.input = ip6_mc_input;
2484         else if (cfg->fc_flags & RTF_LOCAL)
2485                 rt->dst.input = ip6_input;
2486         else
2487                 rt->dst.input = ip6_forward;
2488
2489         rt->dst.output = ip6_output;
2490
2491         if (cfg->fc_encap) {
2492                 struct lwtunnel_state *lwtstate;
2493
2494                 err = lwtunnel_build_state(cfg->fc_encap_type,
2495                                            cfg->fc_encap, AF_INET6, cfg,
2496                                            &lwtstate, extack);
2497                 if (err)
2498                         goto out;
2499                 rt->dst.lwtstate = lwtstate_get(lwtstate);
2500                 if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
2501                         rt->dst.lwtstate->orig_output = rt->dst.output;
2502                         rt->dst.output = lwtunnel_output;
2503                 }
2504                 if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
2505                         rt->dst.lwtstate->orig_input = rt->dst.input;
2506                         rt->dst.input = lwtunnel_input;
2507                 }
2508         }
2509
2510         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
2511         rt->rt6i_dst.plen = cfg->fc_dst_len;
2512         if (rt->rt6i_dst.plen == 128)
2513                 rt->dst.flags |= DST_HOST;
2514
2515 #ifdef CONFIG_IPV6_SUBTREES
2516         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
2517         rt->rt6i_src.plen = cfg->fc_src_len;
2518 #endif
2519
2520         rt->rt6i_metric = cfg->fc_metric;
2521
2522         /* We cannot add true routes via loopback here,
2523            they would result in kernel looping; promote them to reject routes
2524          */
2525         if ((cfg->fc_flags & RTF_REJECT) ||
2526             (dev && (dev->flags & IFF_LOOPBACK) &&
2527              !(addr_type & IPV6_ADDR_LOOPBACK) &&
2528              !(cfg->fc_flags & RTF_LOCAL))) {
2529                 /* hold loopback dev/idev if we haven't done so. */
2530                 if (dev != net->loopback_dev) {
2531                         if (dev) {
2532                                 dev_put(dev);
2533                                 in6_dev_put(idev);
2534                         }
2535                         dev = net->loopback_dev;
2536                         dev_hold(dev);
2537                         idev = in6_dev_get(dev);
2538                         if (!idev) {
2539                                 err = -ENODEV;
2540                                 goto out;
2541                         }
2542                 }
2543                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
2544                 switch (cfg->fc_type) {
2545                 case RTN_BLACKHOLE:
2546                         rt->dst.error = -EINVAL;
2547                         rt->dst.output = dst_discard_out;
2548                         rt->dst.input = dst_discard;
2549                         break;
2550                 case RTN_PROHIBIT:
2551                         rt->dst.error = -EACCES;
2552                         rt->dst.output = ip6_pkt_prohibit_out;
2553                         rt->dst.input = ip6_pkt_prohibit;
2554                         break;
2555                 case RTN_THROW:
2556                 case RTN_UNREACHABLE:
2557                 default:
2558                         rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
2559                                         : (cfg->fc_type == RTN_UNREACHABLE)
2560                                         ? -EHOSTUNREACH : -ENETUNREACH;
2561                         rt->dst.output = ip6_pkt_discard_out;
2562                         rt->dst.input = ip6_pkt_discard;
2563                         break;
2564                 }
2565                 goto install_route;
2566         }
2567
2568         if (cfg->fc_flags & RTF_GATEWAY) {
2569                 const struct in6_addr *gw_addr;
2570                 int gwa_type;
2571
2572                 gw_addr = &cfg->fc_gateway;
2573                 gwa_type = ipv6_addr_type(gw_addr);
2574
2575                 /* if gw_addr is local we will fail to detect this in case
2576                  * address is still TENTATIVE (DAD in progress). rt6_lookup()
2577                  * will return already-added prefix route via interface that
2578                  * prefix route was assigned to, which might be non-loopback.
2579                  */
2580                 err = -EINVAL;
2581                 if (ipv6_chk_addr_and_flags(net, gw_addr,
2582                                             gwa_type & IPV6_ADDR_LINKLOCAL ?
2583                                             dev : NULL, 0, 0)) {
2584                         NL_SET_ERR_MSG(extack, "Invalid gateway address");
2585                         goto out;
2586                 }
2587                 rt->rt6i_gateway = *gw_addr;
2588
2589                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
2590                         struct rt6_info *grt = NULL;
2591
2592                         /* IPv6 strictly inhibits using not link-local
2593                            addresses as nexthop address.
2594                            Otherwise, router will not able to send redirects.
2595                            It is very good, but in some (rare!) circumstances
2596                            (SIT, PtP, NBMA NOARP links) it is handy to allow
2597                            some exceptions. --ANK
2598                            We allow IPv4-mapped nexthops to support RFC4798-type
2599                            addressing
2600                          */
2601                         if (!(gwa_type & (IPV6_ADDR_UNICAST |
2602                                           IPV6_ADDR_MAPPED))) {
2603                                 NL_SET_ERR_MSG(extack,
2604                                                "Invalid gateway address");
2605                                 goto out;
2606                         }
2607
2608                         if (cfg->fc_table) {
2609                                 grt = ip6_nh_lookup_table(net, cfg, gw_addr);
2610
2611                                 if (grt) {
2612                                         if (grt->rt6i_flags & RTF_GATEWAY ||
2613                                             (dev && dev != grt->dst.dev)) {
2614                                                 ip6_rt_put(grt);
2615                                                 grt = NULL;
2616                                         }
2617                                 }
2618                         }
2619
2620                         if (!grt)
2621                                 grt = rt6_lookup(net, gw_addr, NULL,
2622                                                  cfg->fc_ifindex, 1);
2623
2624                         err = -EHOSTUNREACH;
2625                         if (!grt)
2626                                 goto out;
2627                         if (dev) {
2628                                 if (dev != grt->dst.dev) {
2629                                         ip6_rt_put(grt);
2630                                         goto out;
2631                                 }
2632                         } else {
2633                                 dev = grt->dst.dev;
2634                                 idev = grt->rt6i_idev;
2635                                 dev_hold(dev);
2636                                 in6_dev_hold(grt->rt6i_idev);
2637                         }
2638                         if (!(grt->rt6i_flags & RTF_GATEWAY))
2639                                 err = 0;
2640                         ip6_rt_put(grt);
2641
2642                         if (err)
2643                                 goto out;
2644                 }
2645                 err = -EINVAL;
2646                 if (!dev) {
2647                         NL_SET_ERR_MSG(extack, "Egress device not specified");
2648                         goto out;
2649                 } else if (dev->flags & IFF_LOOPBACK) {
2650                         NL_SET_ERR_MSG(extack,
2651                                        "Egress device can not be loopback device for this route");
2652                         goto out;
2653                 }
2654         }
2655
2656         err = -ENODEV;
2657         if (!dev)
2658                 goto out;
2659
2660         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2661                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2662                         NL_SET_ERR_MSG(extack, "Invalid source address");
2663                         err = -EINVAL;
2664                         goto out;
2665                 }
2666                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2667                 rt->rt6i_prefsrc.plen = 128;
2668         } else
2669                 rt->rt6i_prefsrc.plen = 0;
2670
2671         rt->rt6i_flags = cfg->fc_flags;
2672
2673 install_route:
2674         rt->dst.dev = dev;
2675         rt->rt6i_idev = idev;
2676         rt->rt6i_table = table;
2677
2678         cfg->fc_nlinfo.nl_net = dev_net(dev);
2679
2680         return rt;
2681 out:
2682         if (dev)
2683                 dev_put(dev);
2684         if (idev)
2685                 in6_dev_put(idev);
2686         if (rt)
2687                 dst_release_immediate(&rt->dst);
2688
2689         return ERR_PTR(err);
2690 }
2691
2692 int ip6_route_add(struct fib6_config *cfg,
2693                   struct netlink_ext_ack *extack)
2694 {
2695         struct mx6_config mxc = { .mx = NULL, };
2696         struct rt6_info *rt;
2697         int err;
2698
2699         rt = ip6_route_info_create(cfg, extack);
2700         if (IS_ERR(rt)) {
2701                 err = PTR_ERR(rt);
2702                 rt = NULL;
2703                 goto out;
2704         }
2705
2706         err = ip6_convert_metrics(&mxc, cfg);
2707         if (err)
2708                 goto out;
2709
2710         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
2711
2712         kfree(mxc.mx);
2713
2714         return err;
2715 out:
2716         if (rt)
2717                 dst_release_immediate(&rt->dst);
2718
2719         return err;
2720 }
2721
2722 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2723 {
2724         int err;
2725         struct fib6_table *table;
2726         struct net *net = dev_net(rt->dst.dev);
2727
2728         if (rt == net->ipv6.ip6_null_entry) {
2729                 err = -ENOENT;
2730                 goto out;
2731         }
2732
2733         table = rt->rt6i_table;
2734         write_lock_bh(&table->tb6_lock);
2735         err = fib6_del(rt, info);
2736         write_unlock_bh(&table->tb6_lock);
2737
2738 out:
2739         ip6_rt_put(rt);
2740         return err;
2741 }
2742
2743 int ip6_del_rt(struct rt6_info *rt)
2744 {
2745         struct nl_info info = {
2746                 .nl_net = dev_net(rt->dst.dev),
2747         };
2748         return __ip6_del_rt(rt, &info);
2749 }
2750
/* Delete a multipath route: remove @rt and, when cfg->fc_delete_all_nh
 * is set, every sibling nexthop, all under one table write lock.
 * A single RTM_DELROUTE notification describing all hops is sent when
 * the skb can be built; otherwise fib6_del() emits per-hop messages.
 * Consumes the caller's reference on @rt.
 */
static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	int err = -ENOENT;

	if (rt == net->ipv6.ip6_null_entry)
		goto out_put;
	table = rt->rt6i_table;
	write_lock_bh(&table->tb6_lock);

	if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
		struct rt6_info *sibling, *next_sibling;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			if (rt6_fill_node(net, skb, rt,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				/* fall back to per-hop notifications */
				kfree_skb(skb);
				skb = NULL;
			} else
				info->skip_notify = 1;
		}

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->rt6i_siblings,
					 rt6i_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
		}
	}

	err = fib6_del(rt, info);
out_unlock:
	write_unlock_bh(&table->tb6_lock);
out_put:
	ip6_rt_put(rt);

	/* send the combined notification only after the lock is dropped */
	if (skb) {
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
	return err;
}
2802
/* Delete the route(s) described by @cfg.  With RTF_CACHE set, the
 * matching entry in the exception (cached clone) table is removed
 * instead of the FIB route itself.  Returns 0 on success or -ESRCH
 * when nothing matched.
 */
static int ip6_route_del(struct fib6_config *cfg,
			 struct netlink_ext_ack *extack)
{
	struct rt6_info *rt, *rt_cache;
	struct fib6_table *table;
	struct fib6_node *fn;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table) {
		NL_SET_ERR_MSG(extack, "FIB table does not exist");
		return err;
	}

	read_lock_bh(&table->tb6_lock);

	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len,
			 !(cfg->fc_flags & RTF_CACHE));

	if (fn) {
		for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
			if (cfg->fc_flags & RTF_CACHE) {
				/* operate on the cached clone, not the
				 * FIB entry it hangs off
				 */
				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
							      &cfg->fc_src);
				if (!rt_cache)
					continue;
				rt = rt_cache;
			}
			if (cfg->fc_ifindex &&
			    (!rt->dst.dev ||
			     rt->dst.dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
				continue;
			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
				continue;
			if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
				continue;
			/* hold the route across the lock drop; the delete
			 * helpers below consume this reference
			 */
			dst_hold(&rt->dst);
			read_unlock_bh(&table->tb6_lock);

			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				return __ip6_del_rt(rt, &cfg->fc_nlinfo);

			return __ip6_del_rt_siblings(rt, cfg);
		}
	}
	read_unlock_bh(&table->tb6_lock);

	return err;
}
2858
/* Process a received ICMPv6 Redirect (RFC 4861, section 8): validate
 * the message and its ND options, update the neighbour cache with the
 * new first hop, and install a cached clone of @dst pointing at the
 * redirect target.  Malformed or disallowed redirects (forwarding
 * enabled, accept_redirects off, reject routes, ...) are dropped
 * silently apart from a ratelimited debug message.
 */
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct netevent_redirect netevent;
	struct rt6_info *rt, *nrt = NULL;
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	struct rd_msg *msg;
	int optlen, on_link;
	u8 *lladdr;

	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
	optlen -= sizeof(*msg);

	if (optlen < 0) {
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
		return;
	}

	msg = (struct rd_msg *)icmp6_hdr(skb);

	if (ipv6_addr_is_multicast(&msg->dest)) {
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
		return;
	}

	/* target == dest means the destination itself is on-link;
	 * otherwise the target must be a link-local router address
	 */
	on_link = 0;
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
		on_link = 1;
	} else if (ipv6_addr_type(&msg->target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
		return;
	}

	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	/* hosts only: routers must not honour redirects, and the admin
	 * may have disabled them
	 */
	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}

	lladdr = NULL;
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

	rt = (struct rt6_info *) dst;
	if (rt->rt6i_flags & RTF_REJECT) {
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
		return;
	}

	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);

	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
	if (!neigh)
		return;

	/*
	 *	We have finally decided to accept it.
	 */

	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER)),
		     NDISC_REDIRECT, &ndopts);

	/* clone the route and point the clone at the new gateway */
	nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
	if (!nrt)
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	nrt->rt6i_protocol = RTPROT_REDIRECT;
	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

	/* No need to remove rt from the exception table if rt is
	 * a cached route because rt6_insert_exception() will
	 * takes care of it
	 */
	if (rt6_insert_exception(nrt, rt)) {
		dst_release_immediate(&nrt->dst);
		goto out;
	}

	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
	netevent.daddr = &msg->dest;
	netevent.neigh = neigh;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

out:
	neigh_release(neigh);
}
2976
2977 /*
2978  *      Misc support functions
2979  */
2980
/* Record @from as the origin of @rt and inherit its metrics.
 * Takes a reference on @from's dst before publishing the pointer;
 * RTF_EXPIRES is cleared since the copy's lifetime now follows its
 * origin.  @from must not itself have a "from" origin (BUG otherwise).
 */
static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
{
	BUG_ON(from->dst.from);

	rt->rt6i_flags &= ~RTF_EXPIRES;
	dst_hold(&from->dst);
	rt->dst.from = &from->dst;
	dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
}
2990
2991 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
2992 {
2993         rt->dst.input = ort->dst.input;
2994         rt->dst.output = ort->dst.output;
2995         rt->rt6i_dst = ort->rt6i_dst;
2996         rt->dst.error = ort->dst.error;
2997         rt->rt6i_idev = ort->rt6i_idev;
2998         if (rt->rt6i_idev)
2999                 in6_dev_hold(rt->rt6i_idev);
3000         rt->dst.lastuse = jiffies;
3001         rt->rt6i_gateway = ort->rt6i_gateway;
3002         rt->rt6i_flags = ort->rt6i_flags;
3003         rt6_set_from(rt, ort);
3004         rt->rt6i_metric = ort->rt6i_metric;
3005 #ifdef CONFIG_IPV6_SUBTREES
3006         rt->rt6i_src = ort->rt6i_src;
3007 #endif
3008         rt->rt6i_prefsrc = ort->rt6i_prefsrc;
3009         rt->rt6i_table = ort->rt6i_table;
3010         rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
3011 }
3012
3013 #ifdef CONFIG_IPV6_ROUTE_INFO
3014 static struct rt6_info *rt6_get_route_info(struct net *net,
3015                                            const struct in6_addr *prefix, int prefixlen,
3016                                            const struct in6_addr *gwaddr,
3017                                            struct net_device *dev)
3018 {
3019         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3020         int ifindex = dev->ifindex;
3021         struct fib6_node *fn;
3022         struct rt6_info *rt = NULL;
3023         struct fib6_table *table;
3024
3025         table = fib6_get_table(net, tb_id);
3026         if (!table)
3027                 return NULL;
3028
3029         read_lock_bh(&table->tb6_lock);
3030         fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3031         if (!fn)
3032                 goto out;
3033
3034         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
3035                 if (rt->dst.dev->ifindex != ifindex)
3036                         continue;
3037                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3038                         continue;
3039                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
3040                         continue;
3041                 dst_hold(&rt->dst);
3042                 break;
3043         }
3044 out:
3045         read_unlock_bh(&table->tb6_lock);
3046         return rt;
3047 }
3048
3049 static struct rt6_info *rt6_add_route_info(struct net *net,
3050                                            const struct in6_addr *prefix, int prefixlen,
3051                                            const struct in6_addr *gwaddr,
3052                                            struct net_device *dev,
3053                                            unsigned int pref)
3054 {
3055         struct fib6_config cfg = {
3056                 .fc_metric      = IP6_RT_PRIO_USER,
3057                 .fc_ifindex     = dev->ifindex,
3058                 .fc_dst_len     = prefixlen,
3059                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3060                                   RTF_UP | RTF_PREF(pref),
3061                 .fc_protocol = RTPROT_RA,
3062                 .fc_nlinfo.portid = 0,
3063                 .fc_nlinfo.nlh = NULL,
3064                 .fc_nlinfo.nl_net = net,
3065         };
3066
3067         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3068         cfg.fc_dst = *prefix;
3069         cfg.fc_gateway = *gwaddr;
3070
3071         /* We should treat it as a default route if prefix length is 0. */
3072         if (!prefixlen)
3073                 cfg.fc_flags |= RTF_DEFAULT;
3074
3075         ip6_route_add(&cfg, NULL);
3076
3077         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3078 }
3079 #endif
3080
3081 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
3082 {
3083         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3084         struct rt6_info *rt;
3085         struct fib6_table *table;
3086
3087         table = fib6_get_table(dev_net(dev), tb_id);
3088         if (!table)
3089                 return NULL;
3090
3091         read_lock_bh(&table->tb6_lock);
3092         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
3093                 if (dev == rt->dst.dev &&
3094                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3095                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
3096                         break;
3097         }
3098         if (rt)
3099                 dst_hold(&rt->dst);
3100         read_unlock_bh(&table->tb6_lock);
3101         return rt;
3102 }
3103
3104 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
3105                                      struct net_device *dev,
3106                                      unsigned int pref)
3107 {
3108         struct fib6_config cfg = {
3109                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3110                 .fc_metric      = IP6_RT_PRIO_USER,
3111                 .fc_ifindex     = dev->ifindex,
3112                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3113                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3114                 .fc_protocol = RTPROT_RA,
3115                 .fc_nlinfo.portid = 0,
3116                 .fc_nlinfo.nlh = NULL,
3117                 .fc_nlinfo.nl_net = dev_net(dev),
3118         };
3119
3120         cfg.fc_gateway = *gwaddr;
3121
3122         if (!ip6_route_add(&cfg, NULL)) {
3123                 struct fib6_table *table;
3124
3125                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3126                 if (table)
3127                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3128         }
3129
3130         return rt6_get_dflt_router(gwaddr, dev);
3131 }
3132
/* Remove every RA-learned default router from @table, except routes
 * whose interface has accept_ra == 2 (accept RAs even when
 * forwarding).  The read lock must be dropped before each delete, so
 * the walk restarts from the head after every removal.
 */
static void __rt6_purge_dflt_routers(struct fib6_table *table)
{
	struct rt6_info *rt;

restart:
	read_lock_bh(&table->tb6_lock);
	for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
			/* reference consumed by ip6_del_rt() */
			dst_hold(&rt->dst);
			read_unlock_bh(&table->tb6_lock);
			ip6_del_rt(rt);
			goto restart;
		}
	}
	read_unlock_bh(&table->tb6_lock);

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}
3152
3153 void rt6_purge_dflt_routers(struct net *net)
3154 {
3155         struct fib6_table *table;
3156         struct hlist_head *head;
3157         unsigned int h;
3158
3159         rcu_read_lock();
3160
3161         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3162                 head = &net->ipv6.fib_table_hash[h];
3163                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3164                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3165                                 __rt6_purge_dflt_routers(table);
3166                 }
3167         }
3168
3169         rcu_read_unlock();
3170 }
3171
3172 static void rtmsg_to_fib6_config(struct net *net,
3173                                  struct in6_rtmsg *rtmsg,
3174                                  struct fib6_config *cfg)
3175 {
3176         memset(cfg, 0, sizeof(*cfg));
3177
3178         cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3179                          : RT6_TABLE_MAIN;
3180         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3181         cfg->fc_metric = rtmsg->rtmsg_metric;
3182         cfg->fc_expires = rtmsg->rtmsg_info;
3183         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3184         cfg->fc_src_len = rtmsg->rtmsg_src_len;
3185         cfg->fc_flags = rtmsg->rtmsg_flags;
3186
3187         cfg->fc_nlinfo.nl_net = net;
3188
3189         cfg->fc_dst = rtmsg->rtmsg_dst;
3190         cfg->fc_src = rtmsg->rtmsg_src;
3191         cfg->fc_gateway = rtmsg->rtmsg_gateway;
3192 }
3193
3194 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3195 {
3196         struct fib6_config cfg;
3197         struct in6_rtmsg rtmsg;
3198         int err;
3199
3200         switch (cmd) {
3201         case SIOCADDRT:         /* Add a route */
3202         case SIOCDELRT:         /* Delete a route */
3203                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3204                         return -EPERM;
3205                 err = copy_from_user(&rtmsg, arg,
3206                                      sizeof(struct in6_rtmsg));
3207                 if (err)
3208                         return -EFAULT;
3209
3210                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3211
3212                 rtnl_lock();
3213                 switch (cmd) {
3214                 case SIOCADDRT:
3215                         err = ip6_route_add(&cfg, NULL);
3216                         break;
3217                 case SIOCDELRT:
3218                         err = ip6_route_del(&cfg, NULL);
3219                         break;
3220                 default:
3221                         err = -EINVAL;
3222                 }
3223                 rtnl_unlock();
3224
3225                 return err;
3226         }
3227
3228         return -EINVAL;
3229 }
3230
3231 /*
3232  *      Drop the packet on the floor
3233  */
3234
/* Common drop path for packets with no usable route: bump the proper
 * SNMP counter, send an ICMPv6 destination-unreachable with @code,
 * and free the skb.  Always returns 0.
 */
static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
	int type;
	struct dst_entry *dst = skb_dst(skb);
	switch (ipstats_mib_noroutes) {
	case IPSTATS_MIB_INNOROUTES:
		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
		if (type == IPV6_ADDR_ANY) {
			/* unspecified destination is an address error,
			 * not a routing failure
			 */
			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
				      IPSTATS_MIB_INADDRERRORS);
			break;
		}
		/* FALLTHROUGH */
	case IPSTATS_MIB_OUTNOROUTES:
		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
			      ipstats_mib_noroutes);
		break;
	}
	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
	kfree_skb(skb);
	return 0;
}
3257
/* dst.input handler for unreachable/throw routes: drop with "no route" */
static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
3262
/* dst.output handler for unreachable/throw routes: drop with "no route" */
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	/* point skb->dev at the route's device so the ICMP error and
	 * stats are attributed correctly
	 */
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}
3268
/* dst.input handler for prohibit routes: drop with "administratively
 * prohibited"
 */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}
3273
/* dst.output handler for prohibit routes: drop with "administratively
 * prohibited"
 */
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	/* attribute the error/stats to the route's device */
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}
3279
3280 /*
3281  *      Allocate a dst for local (unicast / anycast) address.
3282  */
3283
3284 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
3285                                     const struct in6_addr *addr,
3286                                     bool anycast)
3287 {
3288         u32 tb_id;
3289         struct net *net = dev_net(idev->dev);
3290         struct net_device *dev = idev->dev;
3291         struct rt6_info *rt;
3292
3293         rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
3294         if (!rt)
3295                 return ERR_PTR(-ENOMEM);
3296
3297         in6_dev_hold(idev);
3298
3299         rt->dst.flags |= DST_HOST;
3300         rt->dst.input = ip6_input;
3301         rt->dst.output = ip6_output;
3302         rt->rt6i_idev = idev;
3303
3304         rt->rt6i_protocol = RTPROT_KERNEL;
3305         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
3306         if (anycast)
3307                 rt->rt6i_flags |= RTF_ANYCAST;
3308         else
3309                 rt->rt6i_flags |= RTF_LOCAL;
3310
3311         rt->rt6i_gateway  = *addr;
3312         rt->rt6i_dst.addr = *addr;
3313         rt->rt6i_dst.plen = 128;
3314         tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3315         rt->rt6i_table = fib6_get_table(net, tb_id);
3316
3317         return rt;
3318 }
3319
/* remove deleted ip from prefsrc entries */
struct arg_dev_net_ip {
	struct net_device *dev;	/* restrict to this device; NULL = any */
	struct net *net;	/* used to skip the null entry */
	struct in6_addr *addr;	/* preferred source address being removed */
};
3326
3327 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
3328 {
3329         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3330         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3331         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3332
3333         if (((void *)rt->dst.dev == dev || !dev) &&
3334             rt != net->ipv6.ip6_null_entry &&
3335             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
3336                 spin_lock_bh(&rt6_exception_lock);
3337                 /* remove prefsrc entry */
3338                 rt->rt6i_prefsrc.plen = 0;
3339                 /* need to update cache as well */
3340                 rt6_exceptions_remove_prefsrc(rt);
3341                 spin_unlock_bh(&rt6_exception_lock);
3342         }
3343         return 0;
3344 }
3345
3346 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3347 {
3348         struct net *net = dev_net(ifp->idev->dev);
3349         struct arg_dev_net_ip adni = {
3350                 .dev = ifp->idev->dev,
3351                 .net = net,
3352                 .addr = &ifp->addr,
3353         };
3354         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3355 }
3356
3357 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3358
3359 /* Remove routers and update dst entries when gateway turn into host. */
3360 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
3361 {
3362         struct in6_addr *gateway = (struct in6_addr *)arg;
3363
3364         if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3365             ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
3366                 return -1;
3367         }
3368
3369         /* Further clean up cached routes in exception table.
3370          * This is needed because cached route may have a different
3371          * gateway than its 'parent' in the case of an ip redirect.
3372          */
3373         rt6_exceptions_clean_tohost(rt, gateway);
3374
3375         return 0;
3376 }
3377
/* A neighbour stopped being a router: walk every table in @net,
 * deleting RA-learned routes via @gateway and purging matching
 * cached exception routes (see fib6_clean_tohost()).
 */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}
3382
/* argument bundle for fib6_ifdown() */
struct arg_dev_net {
	struct net_device *dev;	/* device going down; NULL matches any device */
	struct net *net;	/* namespace being cleaned */
};
3387
/* called with write lock held for table with rt */
static int fib6_ifdown(struct rt6_info *rt, void *arg)
{
	const struct arg_dev_net *adn = arg;
	const struct net_device *dev = adn->dev;

	/* Returning -1 tells fib6_clean_all() to delete this route.
	 * Delete when the route egresses the dying device (or any device
	 * if dev == NULL), is not the null entry, and one of: it has no
	 * ECMP siblings, the device is being unregistered (so the route
	 * cannot be kept around), or link-down routes are not retained.
	 */
	if ((rt->dst.dev == dev || !dev) &&
	    rt != adn->net->ipv6.ip6_null_entry &&
	    (rt->rt6i_nsiblings == 0 ||
	     (dev && netdev_unregistering(dev)) ||
	     !rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
		return -1;

	return 0;
}
3403
3404 void rt6_ifdown(struct net *net, struct net_device *dev)
3405 {
3406         struct arg_dev_net adn = {
3407                 .dev = dev,
3408                 .net = net,
3409         };
3410
3411         fib6_clean_all(net, fib6_ifdown, &adn);
3412         if (dev)
3413                 rt6_uncached_list_flush_dev(net, dev);
3414 }
3415
/* argument bundle for rt6_mtu_change_route() */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new MTU */
};
3420
/* fib6_clean_all() callback: propagate an administrative MTU change on
 * arg->dev to every route egressing that device, including its cached
 * exception routes.  Always returns 0 (never deletes the route).
 */
static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e. jumbo frame)
	 */
	/*
	   If new MTU is less than route PMTU, this new MTU will be the
	   lowest MTU in the path, update the route PMTU to reflect PMTU
	   decreases; if new MTU is greater than route PMTU, and the
	   old MTU is the lowest MTU in the path, update the route PMTU
	   to reflect the increase. In this case if the other nodes' MTU
	   also have the lowest MTU, TOO BIG MESSAGE will be lead to
	   PMTU discovery.
	 */
	if (rt->dst.dev == arg->dev &&
	    dst_metric_raw(&rt->dst, RTAX_MTU) &&
	    !dst_metric_locked(&rt->dst, RTAX_MTU)) {
		/* rt6_exception_lock also guards the exception-table
		 * update below so metric and cache stay consistent
		 */
		spin_lock_bh(&rt6_exception_lock);
		if (dst_mtu(&rt->dst) >= arg->mtu ||
		    (dst_mtu(&rt->dst) < arg->mtu &&
		     dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
			dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
		}
		rt6_exceptions_update_pmtu(rt, arg->mtu);
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}
3464
3465 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
3466 {
3467         struct rt6_mtu_change_arg arg = {
3468                 .dev = dev,
3469                 .mtu = mtu,
3470         };
3471
3472         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
3473 }
3474
/* Netlink attribute policy for RTM_{NEW,DEL,GET}ROUTE.  Attributes
 * without an entry here (e.g. RTA_DST/RTA_SRC) are length-checked by
 * the handlers themselves (see rtm_to_fib6_config()).
 */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
	[RTA_OIF]               = { .type = NLA_U32 },
	[RTA_IIF]               = { .type = NLA_U32 },
	[RTA_PRIORITY]          = { .type = NLA_U32 },
	[RTA_METRICS]           = { .type = NLA_NESTED },
	[RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]              = { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
	[RTA_ENCAP]             = { .type = NLA_NESTED },
	[RTA_EXPIRES]           = { .type = NLA_U32 },
	[RTA_UID]               = { .type = NLA_U32 },
	[RTA_MARK]              = { .type = NLA_U32 },
};
3489
/* Translate an RTM_{NEW,DEL}ROUTE netlink message into a fib6_config.
 * Returns 0 on success, a negative errno otherwise.  On success @cfg
 * may hold pointers into the message's attribute data (fc_mx, fc_mp,
 * fc_encap), so @nlh must stay valid for as long as @cfg is used.
 */
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg,
			      struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  NULL);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	memset(cfg, 0, sizeof(*cfg));

	cfg->fc_table = rtm->rtm_table;
	cfg->fc_dst_len = rtm->rtm_dst_len;
	cfg->fc_src_len = rtm->rtm_src_len;
	cfg->fc_flags = RTF_UP;
	cfg->fc_protocol = rtm->rtm_protocol;
	cfg->fc_type = rtm->rtm_type;

	/* reject-type routes get RTF_REJECT; the exact type is kept in
	 * fc_type so the right dst.error can be chosen at insert time
	 */
	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
	cfg->fc_nlinfo.nlh = nlh;
	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}

	if (tb[RTA_DST]) {
		/* only prefix-length bytes need be present */
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	/* RTA_TABLE, when present, overrides rtm_table */
	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
						     cfg->fc_mp_len, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_PREF]) {
		/* out-of-range router preferences fall back to medium */
		pref = nla_get_u8(tb[RTA_PREF]);
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

	if (tb[RTA_ENCAP_TYPE]) {
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_EXPIRES]) {
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
		}
	}

	err = 0;
errout:
	return err;
}
3613
/* one pending nexthop while building a multipath route */
struct rt6_nh {
	struct rt6_info *rt6_info;	/* route to insert; NULLed once consumed */
	struct fib6_config r_cfg;	/* per-nexthop config, kept for rollback */
	struct mx6_config mxc;		/* converted route metrics */
	struct list_head next;		/* link in rt6_nh_list */
};
3620
3621 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
3622 {
3623         struct rt6_nh *nh;
3624
3625         list_for_each_entry(nh, rt6_nh_list, next) {
3626                 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
3627                         &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
3628                         nh->r_cfg.fc_ifindex);
3629         }
3630 }
3631
3632 static int ip6_route_info_append(struct list_head *rt6_nh_list,
3633                                  struct rt6_info *rt, struct fib6_config *r_cfg)
3634 {
3635         struct rt6_nh *nh;
3636         int err = -EEXIST;
3637
3638         list_for_each_entry(nh, rt6_nh_list, next) {
3639                 /* check if rt6_info already exists */
3640                 if (rt6_duplicate_nexthop(nh->rt6_info, rt))
3641                         return err;
3642         }
3643
3644         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
3645         if (!nh)
3646                 return -ENOMEM;
3647         nh->rt6_info = rt;
3648         err = ip6_convert_metrics(&nh->mxc, r_cfg);
3649         if (err) {
3650                 kfree(nh);
3651                 return err;
3652         }
3653         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
3654         list_add_tail(&nh->next, rt6_nh_list);
3655
3656         return 0;
3657 }
3658
3659 static void ip6_route_mpath_notify(struct rt6_info *rt,
3660                                    struct rt6_info *rt_last,
3661                                    struct nl_info *info,
3662                                    __u16 nlflags)
3663 {
3664         /* if this is an APPEND route, then rt points to the first route
3665          * inserted and rt_last points to last route inserted. Userspace
3666          * wants a consistent dump of the route which starts at the first
3667          * nexthop. Since sibling routes are always added at the end of
3668          * the list, find the first sibling of the last route appended
3669          */
3670         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
3671                 rt = list_first_entry(&rt_last->rt6i_siblings,
3672                                       struct rt6_info,
3673                                       rt6i_siblings);
3674         }
3675
3676         if (rt)
3677                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
3678 }
3679
/* Insert one route per nexthop of an RTA_MULTIPATH request as ECMP
 * siblings.  All nexthops are parsed and queued first, then inserted;
 * if an insertion fails, the routes added so far are deleted again so
 * the operation is all-or-nothing.  Returns 0 or a negative errno.
 */
static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct rt6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct rt6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	__u16 nlflags;
	int remaining;
	int attrlen;
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);

	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * rt6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		/* per-nexthop config starts from the shared config and
		 * is overridden by the rtnexthop's own attributes
		 */
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		rt = ip6_route_info_create(&r_cfg, extack);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}

		err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
		if (err) {
			/* append failed, so we still own rt: drop it */
			dst_release_immediate(&rt->dst);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* for add and replace send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		rt_last = nh->rt6_info;
		err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
		/* save reference to first route for notification */
		if (!rt_notif && !err)
			rt_notif = nh->rt6_info;

		/* nh->rt6_info is used or freed at this point, reset to NULL*/
		nh->rt6_info = NULL;
		if (err) {
			if (replace && nhn)
				ip6_print_replace_route_err(&rt6_nh_list);
			err_nh = nh;
			goto add_errout;
		}

		/* Because each route is added like a single route we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		nhn++;
	}

	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	goto cleanup;

add_errout:
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg, extack);
	}

cleanup:
	/* free the queue; rt6_info is only non-NULL for entries that
	 * were never handed to __ip6_ins_rt()
	 */
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->rt6_info)
			dst_release_immediate(&nh->rt6_info->dst);
		kfree(nh->mxc.mx);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}
3810
3811 static int ip6_route_multipath_del(struct fib6_config *cfg,
3812                                    struct netlink_ext_ack *extack)
3813 {
3814         struct fib6_config r_cfg;
3815         struct rtnexthop *rtnh;
3816         int remaining;
3817         int attrlen;
3818         int err = 1, last_err = 0;
3819
3820         remaining = cfg->fc_mp_len;
3821         rtnh = (struct rtnexthop *)cfg->fc_mp;
3822
3823         /* Parse a Multipath Entry */
3824         while (rtnh_ok(rtnh, remaining)) {
3825                 memcpy(&r_cfg, cfg, sizeof(*cfg));
3826                 if (rtnh->rtnh_ifindex)
3827                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3828
3829                 attrlen = rtnh_attrlen(rtnh);
3830                 if (attrlen > 0) {
3831                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3832
3833                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3834                         if (nla) {
3835                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
3836                                 r_cfg.fc_flags |= RTF_GATEWAY;
3837                         }
3838                 }
3839                 err = ip6_route_del(&r_cfg, extack);
3840                 if (err)
3841                         last_err = err;
3842
3843                 rtnh = rtnh_next(rtnh, &remaining);
3844         }
3845
3846         return last_err;
3847 }
3848
3849 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3850                               struct netlink_ext_ack *extack)
3851 {
3852         struct fib6_config cfg;
3853         int err;
3854
3855         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3856         if (err < 0)
3857                 return err;
3858
3859         if (cfg.fc_mp)
3860                 return ip6_route_multipath_del(&cfg, extack);
3861         else {
3862                 cfg.fc_delete_all_nh = 1;
3863                 return ip6_route_del(&cfg, extack);
3864         }
3865 }
3866
3867 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3868                               struct netlink_ext_ack *extack)
3869 {
3870         struct fib6_config cfg;
3871         int err;
3872
3873         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3874         if (err < 0)
3875                 return err;
3876
3877         if (cfg.fc_mp)
3878                 return ip6_route_multipath_add(&cfg, extack);
3879         else
3880                 return ip6_route_add(&cfg, extack);
3881 }
3882
/* Worst-case netlink message size needed by rt6_fill_node() for @rt,
 * including one rtnexthop entry per ECMP sibling.  Must stay in sync
 * with the attributes rt6_fill_node() emits.
 */
static size_t rt6_nlmsg_size(struct rt6_info *rt)
{
	int nexthop_len = 0;

	if (rt->rt6i_nsiblings) {
		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
			    + NLA_ALIGN(sizeof(struct rtnexthop))
			    + nla_total_size(16) /* RTA_GATEWAY */
			    + lwtunnel_get_encap_size(rt->dst.lwtstate);

		nexthop_len *= rt->rt6i_nsiblings;
	}

	return NLMSG_ALIGN(sizeof(struct rtmsg))
	       + nla_total_size(16) /* RTA_SRC */
	       + nla_total_size(16) /* RTA_DST */
	       + nla_total_size(16) /* RTA_GATEWAY */
	       + nla_total_size(16) /* RTA_PREFSRC */
	       + nla_total_size(4) /* RTA_TABLE */
	       + nla_total_size(4) /* RTA_IIF */
	       + nla_total_size(4) /* RTA_OIF */
	       + nla_total_size(4) /* RTA_PRIORITY */
	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
	       + nla_total_size(sizeof(struct rta_cacheinfo))
	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
	       + nla_total_size(1) /* RTA_PREF */
	       + lwtunnel_get_encap_size(rt->dst.lwtstate)
	       + nexthop_len;
}
3912
/* Emit the nexthop attributes of @rt (gateway, oif, lwtunnel encap)
 * into @skb and accumulate RTNH_F_* bits into *flags.  @skip_oif is
 * set for RTA_MULTIPATH encoding, where the ifindex already lives in
 * struct rtnexthop.  Returns 0 or -EMSGSIZE when @skb is full.
 */
static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
			    unsigned int *flags, bool skip_oif)
{
	/* report routes over down/carrier-less links, and mark them
	 * dead if the admin chose to ignore link-down routes
	 */
	if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) {
		*flags |= RTNH_F_LINKDOWN;
		if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
			*flags |= RTNH_F_DEAD;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
			goto nla_put_failure;
	}

	if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
		*flags |= RTNH_F_OFFLOAD;

	/* not needed for multipath encoding b/c it has a rtnexthop struct */
	if (!skip_oif && rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;

	if (rt->dst.lwtstate &&
	    lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
3944
3945 /* add multipath next hop */
3946 static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
3947 {
3948         struct rtnexthop *rtnh;
3949         unsigned int flags = 0;
3950
3951         rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
3952         if (!rtnh)
3953                 goto nla_put_failure;
3954
3955         rtnh->rtnh_hops = 0;
3956         rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
3957
3958         if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
3959                 goto nla_put_failure;
3960
3961         rtnh->rtnh_flags = flags;
3962
3963         /* length of rtnetlink header + attributes */
3964         rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
3965
3966         return 0;
3967
3968 nla_put_failure:
3969         return -EMSGSIZE;
3970 }
3971
/* Build a complete rtnetlink route message describing @rt into @skb.
 * @dst/@src, when non-NULL, are the addresses a lookup resolved and
 * force full 128-bit prefix lengths in the dump; @iif is the input
 * interface for RTM_GETROUTE replies.  Returns 0 or -EMSGSIZE, in
 * which case the partially built message is cancelled.
 */
static int rt6_fill_node(struct net *net,
			 struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	u32 metrics[RTAX_MAX];
	struct rtmsg *rtm;
	struct nlmsghdr *nlh;
	long expires;
	u32 table;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt->rt6i_dst.plen;
	rtm->rtm_src_len = rt->rt6i_src.plen;
	rtm->rtm_tos = 0;
	if (rt->rt6i_table)
		table = rt->rt6i_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	rtm->rtm_table = table;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;
	/* map a reject route's dst.error back onto the RTN_* type the
	 * user originally configured (see rtm_to_fib6_config())
	 */
	if (rt->rt6i_flags & RTF_REJECT) {
		switch (rt->dst.error) {
		case -EINVAL:
			rtm->rtm_type = RTN_BLACKHOLE;
			break;
		case -EACCES:
			rtm->rtm_type = RTN_PROHIBIT;
			break;
		case -EAGAIN:
			rtm->rtm_type = RTN_THROW;
			break;
		default:
			rtm->rtm_type = RTN_UNREACHABLE;
			break;
		}
	}
	else if (rt->rt6i_flags & RTF_LOCAL)
		rtm->rtm_type = RTN_LOCAL;
	else if (rt->rt6i_flags & RTF_ANYCAST)
		rtm->rtm_type = RTN_ANYCAST;
	else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
		rtm->rtm_type = RTN_LOCAL;
	else
		rtm->rtm_type = RTN_UNICAST;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->rt6i_protocol;

	if (rt->rt6i_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	if (dst) {
		if (nla_put_in6_addr(skb, RTA_DST, dst))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		/* multicast destinations are resolved by the mroute code */
		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dst) {
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->rt6i_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->rt6i_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	/* a per-route PMTU overrides the generic RTAX_MTU metric */
	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	if (rt->rt6i_pmtu)
		metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt->rt6i_nsiblings) {
		struct rt6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (rt6_add_nexthop(skb, rt) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->rt6i_siblings, rt6i_siblings) {
			if (rt6_add_nexthop(skb, sibling) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
			goto nla_put_failure;
	}

	expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
		goto nla_put_failure;


	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
4125
4126 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
4127 {
4128         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4129         struct net *net = arg->net;
4130
4131         if (rt == net->ipv6.ip6_null_entry)
4132                 return 0;
4133
4134         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4135                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4136
4137                 /* user wants prefix routes only */
4138                 if (rtm->rtm_flags & RTM_F_PREFIX &&
4139                     !(rt->rt6i_flags & RTF_PREFIX_RT)) {
4140                         /* success since this is not a prefix route */
4141                         return 1;
4142                 }
4143         }
4144
4145         return rt6_fill_node(net,
4146                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
4147                      NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
4148                      NLM_F_MULTI);
4149 }
4150
4151 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4152                               struct netlink_ext_ack *extack)
4153 {
4154         struct net *net = sock_net(in_skb->sk);
4155         struct nlattr *tb[RTA_MAX+1];
4156         int err, iif = 0, oif = 0;
4157         struct dst_entry *dst;
4158         struct rt6_info *rt;
4159         struct sk_buff *skb;
4160         struct rtmsg *rtm;
4161         struct flowi6 fl6;
4162         bool fibmatch;
4163
4164         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4165                           extack);
4166         if (err < 0)
4167                 goto errout;
4168
4169         err = -EINVAL;
4170         memset(&fl6, 0, sizeof(fl6));
4171         rtm = nlmsg_data(nlh);
4172         fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4173         fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4174
4175         if (tb[RTA_SRC]) {
4176                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4177                         goto errout;
4178
4179                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4180         }
4181
4182         if (tb[RTA_DST]) {
4183                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4184                         goto errout;
4185
4186                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4187         }
4188
4189         if (tb[RTA_IIF])
4190                 iif = nla_get_u32(tb[RTA_IIF]);
4191
4192         if (tb[RTA_OIF])
4193                 oif = nla_get_u32(tb[RTA_OIF]);
4194
4195         if (tb[RTA_MARK])
4196                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4197
4198         if (tb[RTA_UID])
4199                 fl6.flowi6_uid = make_kuid(current_user_ns(),
4200                                            nla_get_u32(tb[RTA_UID]));
4201         else
4202                 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4203
4204         if (iif) {
4205                 struct net_device *dev;
4206                 int flags = 0;
4207
4208                 rcu_read_lock();
4209
4210                 dev = dev_get_by_index_rcu(net, iif);
4211                 if (!dev) {
4212                         rcu_read_unlock();
4213                         err = -ENODEV;
4214                         goto errout;
4215                 }
4216
4217                 fl6.flowi6_iif = iif;
4218
4219                 if (!ipv6_addr_any(&fl6.saddr))
4220                         flags |= RT6_LOOKUP_F_HAS_SADDR;
4221
4222                 if (!fibmatch)
4223                         dst = ip6_route_input_lookup(net, dev, &fl6, flags);
4224                 else
4225                         dst = ip6_route_lookup(net, &fl6, 0);
4226
4227                 rcu_read_unlock();
4228         } else {
4229                 fl6.flowi6_oif = oif;
4230
4231                 if (!fibmatch)
4232                         dst = ip6_route_output(net, NULL, &fl6);
4233                 else
4234                         dst = ip6_route_lookup(net, &fl6, 0);
4235         }
4236
4237
4238         rt = container_of(dst, struct rt6_info, dst);
4239         if (rt->dst.error) {
4240                 err = rt->dst.error;
4241                 ip6_rt_put(rt);
4242                 goto errout;
4243         }
4244
4245         if (rt == net->ipv6.ip6_null_entry) {
4246                 err = rt->dst.error;
4247                 ip6_rt_put(rt);
4248                 goto errout;
4249         }
4250
4251         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4252         if (!skb) {
4253                 ip6_rt_put(rt);
4254                 err = -ENOBUFS;
4255                 goto errout;
4256         }
4257
4258         skb_dst_set(skb, &rt->dst);
4259         if (fibmatch)
4260                 err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
4261                                     RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4262                                     nlh->nlmsg_seq, 0);
4263         else
4264                 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
4265                                     RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4266                                     nlh->nlmsg_seq, 0);
4267         if (err < 0) {
4268                 kfree_skb(skb);
4269                 goto errout;
4270         }
4271
4272         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4273 errout:
4274         return err;
4275 }
4276
4277 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
4278                      unsigned int nlm_flags)
4279 {
4280         struct sk_buff *skb;
4281         struct net *net = info->nl_net;
4282         u32 seq;
4283         int err;
4284
4285         err = -ENOBUFS;
4286         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4287
4288         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4289         if (!skb)
4290                 goto errout;
4291
4292         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
4293                                 event, info->portid, seq, nlm_flags);
4294         if (err < 0) {
4295                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4296                 WARN_ON(err == -EMSGSIZE);
4297                 kfree_skb(skb);
4298                 goto errout;
4299         }
4300         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4301                     info->nlh, gfp_any());
4302         return;
4303 errout:
4304         if (err < 0)
4305                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4306 }
4307
/* Netdevice notifier: keep the per-netns special route stubs (null and,
 * with multiple tables, prohibit/blackhole) bound to the loopback
 * device's inet6_dev across loopback register/unregister.
 */
static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	/* only the loopback device backs the special entries */
	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		/* take an inet6_dev reference for each stub */
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	 } else if (event == NETDEV_UNREGISTER &&
		    dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER could be fired for multiple times by
		 * netdev_wait_allrefs(). Make sure we only call this once.
		 */
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}
4340
4341 /*
4342  *      /proc
4343  */
4344
4345 #ifdef CONFIG_PROC_FS
4346
/* /proc/net/ipv6_route: per-netns seq_file dump of the routing table */
static const struct file_operations ipv6_route_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= ipv6_route_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};
4354
4355 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
4356 {
4357         struct net *net = (struct net *)seq->private;
4358         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
4359                    net->ipv6.rt6_stats->fib_nodes,
4360                    net->ipv6.rt6_stats->fib_route_nodes,
4361                    net->ipv6.rt6_stats->fib_rt_alloc,
4362                    net->ipv6.rt6_stats->fib_rt_entries,
4363                    net->ipv6.rt6_stats->fib_rt_cache,
4364                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
4365                    net->ipv6.rt6_stats->fib_discarded_routes);
4366
4367         return 0;
4368 }
4369
/* open() for /proc/net/rt6_stats: single-shot, netns-aware seq_file */
static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
	return single_open_net(inode, file, rt6_stats_seq_show);
}
4374
/* /proc/net/rt6_stats file operations; paired with single_open_net()
 * in rt6_stats_seq_open(), hence single_release_net on close.
 */
static const struct file_operations rt6_stats_seq_fops = {
	.owner   = THIS_MODULE,
	.open    = rt6_stats_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = single_release_net,
};
4382 #endif  /* CONFIG_PROC_FS */
4383
4384 #ifdef CONFIG_SYSCTL
4385
4386 static
4387 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4388                               void __user *buffer, size_t *lenp, loff_t *ppos)
4389 {
4390         struct net *net;
4391         int delay;
4392         if (!write)
4393                 return -EINVAL;
4394
4395         net = (struct net *)ctl->extra1;
4396         delay = net->ipv6.sysctl.flush_delay;
4397         proc_dointvec(ctl, write, buffer, lenp, ppos);
4398         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
4399         return 0;
4400 }
4401
/* Template for the per-netns net.ipv6.route sysctl table.  The entry
 * ORDER is load-bearing: ipv6_route_sysctl_init() repoints .data by
 * numeric index, so keep the two in sync when adding entries.
 */
struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		/* same variable as gc_min_interval, but in milliseconds */
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{ }
};
4475
/* Clone the sysctl template for a new netns and repoint each entry's
 * .data at the netns-private variable.  Indices must match the entry
 * order of ipv6_route_table_template.  Returns NULL on OOM; the caller
 * registers the table and eventually kfree()s it.
 */
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;	/* flush handler needs the netns */
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
4504 #endif
4505
/* Per-netns init: copy the dst ops template, set up the dst entry
 * counter, allocate the special route stubs (null and, with multiple
 * tables, prohibit/blackhole) and fill in sysctl defaults.
 * Returns 0 or -ENOMEM, unwinding partial allocations via gotos.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	/* null entry: stub dst returned when no route matches */
	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_ip6_dst_entries;
	net->ipv6.ip6_null_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	/* prohibit entry: stub for policy-routing "prohibit" targets */
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	/* blackhole entry: stub for policy-routing "blackhole" targets */
	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	/* defaults for the sysctls in ipv6_route_table_template */
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
4578
4579 static void __net_exit ip6_route_net_exit(struct net *net)
4580 {
4581         kfree(net->ipv6.ip6_null_entry);
4582 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4583         kfree(net->ipv6.ip6_prohibit_entry);
4584         kfree(net->ipv6.ip6_blk_hole_entry);
4585 #endif
4586         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
4587 }
4588
4589 static int __net_init ip6_route_net_init_late(struct net *net)
4590 {
4591 #ifdef CONFIG_PROC_FS
4592         proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
4593         proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
4594 #endif
4595         return 0;
4596 }
4597
/* Late per-netns teardown: remove the /proc/net entries created by
 * ip6_route_net_init_late().
 */
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
4605
/* core per-netns route state (dst ops, special entries, sysctls) */
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
4610
4611 static int __net_init ipv6_inetpeer_init(struct net *net)
4612 {
4613         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
4614
4615         if (!bp)
4616                 return -ENOMEM;
4617         inet_peer_base_init(bp);
4618         net->ipv6.peers = bp;
4619         return 0;
4620 }
4621
4622 static void __net_exit ipv6_inetpeer_exit(struct net *net)
4623 {
4624         struct inet_peer_base *bp = net->ipv6.peers;
4625
4626         net->ipv6.peers = NULL;
4627         inetpeer_invalidate_tree(bp);
4628         kfree(bp);
4629 }
4630
/* per-netns IPv6 inetpeer base lifetime */
static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};
4635
/* late per-netns setup (proc entries) */
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};
4640
/* runs after addrconf's notifier (lower priority = later) so the
 * loopback inet6_dev exists before the special entries grab it
 */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
4645
/* Bind init_net's special route stubs to its loopback device at boot. */
void __init ip6_route_init_special_entries(void)
{
	/* Registering of the loopback is done before this portion of code,
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for init_net */
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #endif
}
4660
4661 int __init ip6_route_init(void)
4662 {
4663         int ret;
4664         int cpu;
4665
4666         ret = -ENOMEM;
4667         ip6_dst_ops_template.kmem_cachep =
4668                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
4669                                   SLAB_HWCACHE_ALIGN, NULL);
4670         if (!ip6_dst_ops_template.kmem_cachep)
4671                 goto out;
4672
4673         ret = dst_entries_init(&ip6_dst_blackhole_ops);
4674         if (ret)
4675                 goto out_kmem_cache;
4676
4677         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
4678         if (ret)
4679                 goto out_dst_entries;
4680
4681         ret = register_pernet_subsys(&ip6_route_net_ops);
4682         if (ret)
4683                 goto out_register_inetpeer;
4684
4685         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
4686
4687         ret = fib6_init();
4688         if (ret)
4689                 goto out_register_subsys;
4690
4691         ret = xfrm6_init();
4692         if (ret)
4693                 goto out_fib6_init;
4694
4695         ret = fib6_rules_init();
4696         if (ret)
4697                 goto xfrm6_init;
4698
4699         ret = register_pernet_subsys(&ip6_route_net_late_ops);
4700         if (ret)
4701                 goto fib6_rules_init;
4702
4703         ret = -ENOBUFS;
4704         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, 0) ||
4705             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, 0) ||
4706             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL,
4707                             RTNL_FLAG_DOIT_UNLOCKED))
4708                 goto out_register_late_subsys;
4709
4710         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
4711         if (ret)
4712                 goto out_register_late_subsys;
4713
4714         for_each_possible_cpu(cpu) {
4715                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
4716
4717                 INIT_LIST_HEAD(&ul->head);
4718                 spin_lock_init(&ul->lock);
4719         }
4720
4721 out:
4722         return ret;
4723
4724 out_register_late_subsys:
4725         unregister_pernet_subsys(&ip6_route_net_late_ops);
4726 fib6_rules_init:
4727         fib6_rules_cleanup();
4728 xfrm6_init:
4729         xfrm6_fini();
4730 out_fib6_init:
4731         fib6_gc_cleanup();
4732 out_register_subsys:
4733         unregister_pernet_subsys(&ip6_route_net_ops);
4734 out_register_inetpeer:
4735         unregister_pernet_subsys(&ipv6_inetpeer_ops);
4736 out_dst_entries:
4737         dst_entries_destroy(&ip6_dst_blackhole_ops);
4738 out_kmem_cache:
4739         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
4740         goto out;
4741 }
4742
/* Tear down everything ip6_route_init() set up, in strict reverse
 * order of registration.
 */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}