asedeno.scripts.mit.edu Git - linux.git / blob - net/ipv6/route.c
commit: ipv6: introduce a hash table to store dst cache
[linux.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <trace/events/fib6.h>
67
68 #include <linux/uaccess.h>
69
70 #ifdef CONFIG_SYSCTL
71 #include <linux/sysctl.h>
72 #endif
73
/* Return values of rt6_check_neigh(); negative values mean the next hop
 * failed the reachability check and are handled in rt6_score_route() /
 * find_match().
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,		/* route must not be selected */
	RT6_NUD_FAIL_PROBE = -2,	/* neighbour FAILED; scores lowest */
	RT6_NUD_FAIL_DO_RR = -1,	/* no neighbour; triggers round-robin */
	RT6_NUD_SUCCEED = 1
};
80
/* Forward declarations for the dst_ops callbacks and local helpers that
 * are defined later in this file.
 */
static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
static unsigned int	 ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
static int		 ip6_dst_gc(struct dst_ops *ops);

static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int		ip6_pkt_prohibit(struct sk_buff *skb);
static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static void		rt6_dst_from_metrics_check(struct rt6_info *rt);
static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct rt6_info *rt);
static int rt6_fill_node(struct net *net,
			 struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct rt6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev,
					   unsigned int pref);
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev);
#endif
123
/* Per-cpu list of routes tracked outside the fib tree (see
 * rt6_uncached_list_add()), so they can be found and re-targeted when
 * their device goes away (rt6_uncached_list_flush_dev()).
 */
struct uncached_list {
	spinlock_t		lock;	/* protects head */
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
130
131 static void rt6_uncached_list_add(struct rt6_info *rt)
132 {
133         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
134
135         rt->rt6i_uncached_list = ul;
136
137         spin_lock_bh(&ul->lock);
138         list_add_tail(&rt->rt6i_uncached, &ul->head);
139         spin_unlock_bh(&ul->lock);
140 }
141
142 static void rt6_uncached_list_del(struct rt6_info *rt)
143 {
144         if (!list_empty(&rt->rt6i_uncached)) {
145                 struct uncached_list *ul = rt->rt6i_uncached_list;
146
147                 spin_lock_bh(&ul->lock);
148                 list_del(&rt->rt6i_uncached);
149                 spin_unlock_bh(&ul->lock);
150         }
151 }
152
/* @dev is going away: re-target every uncached route that still points
 * at it to the namespace's loopback device, moving both the inet6_dev
 * and the netdev references over.
 */
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	/* Nothing to re-target when loopback itself is the device. */
	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				/* hold the new dev before dropping the old */
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
184
185 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
186 {
187         return dst_metrics_write_ptr(rt->dst.from);
188 }
189
190 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
191 {
192         struct rt6_info *rt = (struct rt6_info *)dst;
193
194         if (rt->rt6i_flags & RTF_PCPU)
195                 return rt6_pcpu_cow_metrics(rt);
196         else if (rt->rt6i_flags & RTF_CACHE)
197                 return NULL;
198         else
199                 return dst_cow_metrics_generic(dst, old);
200 }
201
202 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
203                                              struct sk_buff *skb,
204                                              const void *daddr)
205 {
206         struct in6_addr *p = &rt->rt6i_gateway;
207
208         if (!ipv6_addr_any(p))
209                 return (const void *) p;
210         else if (skb)
211                 return &ipv6_hdr(skb)->daddr;
212         return daddr;
213 }
214
215 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
216                                           struct sk_buff *skb,
217                                           const void *daddr)
218 {
219         struct rt6_info *rt = (struct rt6_info *) dst;
220         struct neighbour *n;
221
222         daddr = choose_neigh_daddr(rt, skb, daddr);
223         n = __ipv6_neigh_lookup(dst->dev, daddr);
224         if (n)
225                 return n;
226         return neigh_create(&nd_tbl, daddr, dst->dev);
227 }
228
229 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
230 {
231         struct net_device *dev = dst->dev;
232         struct rt6_info *rt = (struct rt6_info *)dst;
233
234         daddr = choose_neigh_daddr(rt, NULL, daddr);
235         if (!daddr)
236                 return;
237         if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
238                 return;
239         if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
240                 return;
241         __ipv6_confirm_neigh(dev, daddr);
242 }
243
/* Template for the per-namespace net->ipv6.ip6_dst_ops used by regular
 * IPv6 routes (see __ip6_dst_alloc()).
 */
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	ipv6_cow_metrics,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};
262
263 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
264 {
265         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
266
267         return mtu ? : dst->dev->mtu;
268 }
269
/* Blackhole dsts deliberately ignore PMTU updates ... */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

/* ... and redirects; both callbacks are intentionally empty. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}
279
/* dst_ops for blackhole dsts: PMTU updates and redirects are no-ops
 * (see the stubs above), everything else behaves like a normal dst.
 */
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_neigh_lookup,
};
291
/* Metrics template for the sentinel route entries below (hoplimit
 * explicitly unset).
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};
295
/* Template for net->ipv6.ip6_null_entry: the "no route" sentinel that
 * rejects traffic with -ENETUNREACH.
 */
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};
310
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

/* Sentinel route that rejects traffic with -EACCES (policy "prohibit"
 * action).
 */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

/* Sentinel route that silently discards traffic (policy "blackhole"
 * action); errors out with -EINVAL.
 */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

#endif
344
/* Initialize the rt6_info fields that follow the embedded dst_entry:
 * zero everything past the dst, then set up the list heads.
 */
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	/* dst + 1 points just past the dst_entry inside rt6_info. */
	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_siblings);
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}
353
354 /* allocate dst with ip6_dst_ops */
355 static struct rt6_info *__ip6_dst_alloc(struct net *net,
356                                         struct net_device *dev,
357                                         int flags)
358 {
359         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
360                                         1, DST_OBSOLETE_FORCE_CHK, flags);
361
362         if (rt)
363                 rt6_info_init(rt);
364
365         return rt;
366 }
367
368 struct rt6_info *ip6_dst_alloc(struct net *net,
369                                struct net_device *dev,
370                                int flags)
371 {
372         struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
373
374         if (rt) {
375                 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
376                 if (rt->rt6i_pcpu) {
377                         int cpu;
378
379                         for_each_possible_cpu(cpu) {
380                                 struct rt6_info **p;
381
382                                 p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
383                                 /* no one shares rt */
384                                 *p =  NULL;
385                         }
386                 } else {
387                         dst_release_immediate(&rt->dst);
388                         return NULL;
389                 }
390         }
391
392         return rt;
393 }
394 EXPORT_SYMBOL(ip6_dst_alloc);
395
/* dst_ops->destroy: tear down a rt6_info once its last reference is
 * gone — free metrics, per-cpu clones, the exception bucket, and drop
 * the inet6_dev and parent-route references.
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct rt6_exception_bucket *bucket;
	struct dst_entry *from = dst->from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	free_percpu(rt->rt6i_pcpu);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}
	/* Destruction path: access is exclusive, hence the "1" condition. */
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
	if (bucket) {
		rt->rt6i_exception_bucket = NULL;
		kfree(bucket);
	}

	/* Release the route this one was cloned from, if any. */
	dst->from = NULL;
	dst_release(from);
}
421
/* dst_ops->ifdown: @dev is going down, so re-point the route's
 * inet6_dev reference at the namespace loopback device, releasing the
 * old reference only after the new one is taken.
 */
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}
438
439 static bool __rt6_check_expired(const struct rt6_info *rt)
440 {
441         if (rt->rt6i_flags & RTF_EXPIRES)
442                 return time_after(jiffies, rt->dst.expires);
443         else
444                 return false;
445 }
446
/* Like __rt6_check_expired(), but a clone without its own expiry also
 * counts as expired when its dst is obsolete or its parent route
 * (dst.from) has expired.
 */
static bool rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (rt->dst.from) {
		/* recurse into the parent this route was cloned from */
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
		       rt6_check_expired((struct rt6_info *)rt->dst.from);
	}
	return false;
}
458
459 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
460                                              struct flowi6 *fl6, int oif,
461                                              int strict)
462 {
463         struct rt6_info *sibling, *next_sibling;
464         int route_choosen;
465
466         /* We might have already computed the hash for ICMPv6 errors. In such
467          * case it will always be non-zero. Otherwise now is the time to do it.
468          */
469         if (!fl6->mp_hash)
470                 fl6->mp_hash = rt6_multipath_hash(fl6, NULL);
471
472         route_choosen = fl6->mp_hash % (match->rt6i_nsiblings + 1);
473         /* Don't change the route, if route_choosen == 0
474          * (siblings does not include ourself)
475          */
476         if (route_choosen)
477                 list_for_each_entry_safe(sibling, next_sibling,
478                                 &match->rt6i_siblings, rt6i_siblings) {
479                         route_choosen--;
480                         if (route_choosen == 0) {
481                                 if (rt6_score_route(sibling, oif, strict) < 0)
482                                         break;
483                                 match = sibling;
484                                 break;
485                         }
486                 }
487         return match;
488 }
489
490 /*
491  *      Route lookup. Any table->tb6_lock is implied.
492  */
493
/* Pick the first route in the fib-node list starting at @rt that matches
 * the output interface @oif, or (when no oif is given) the source
 * address @saddr.  Falls back to a matching loopback route, the null
 * entry (strict iface match required but missing), or @rt itself.
 */
static inline struct rt6_info *rt6_device_match(struct net *net,
						    struct rt6_info *rt,
						    const struct in6_addr *saddr,
						    int oif,
						    int flags)
{
	struct rt6_info *local = NULL;
	struct rt6_info *sprt;

	/* Nothing to match on: keep the head route. */
	if (!oif && ipv6_addr_any(saddr))
		goto out;

	for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
		struct net_device *dev = sprt->dst.dev;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
			if (dev->flags & IFF_LOOPBACK) {
				/* Remember a loopback route as fallback,
				 * preferring one whose idev matches oif.
				 */
				if (!sprt->rt6i_idev ||
				    sprt->rt6i_idev->dev->ifindex != oif) {
					if (flags & RT6_LOOKUP_F_IFACE)
						continue;
					if (local &&
					    local->rt6i_idev->dev->ifindex == oif)
						continue;
				}
				local = sprt;
			}
		} else {
			/* No oif: match by source address ownership. */
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif) {
		if (local)
			return local;

		/* Strict interface match required but none found. */
		if (flags & RT6_LOOKUP_F_IFACE)
			return net->ipv6.ip6_null_entry;
	}
out:
	return rt;
}
540
541 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred neighbour-solicit probe: carries the target address and a
 * held device reference from rt6_probe() to rt6_probe_deferred().
 */
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;		/* held; released by the work fn */
};
547
548 static void rt6_probe_deferred(struct work_struct *w)
549 {
550         struct in6_addr mcaddr;
551         struct __rt6_probe_work *work =
552                 container_of(w, struct __rt6_probe_work, work);
553
554         addrconf_addr_solict_mult(&work->target, &mcaddr);
555         ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
556         dev_put(work->dev);
557         kfree(work);
558 }
559
/* Queue a deferred reachability probe toward @rt's gateway when its
 * neighbour entry is missing or stale (Router Reachability Probing).
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct __rt6_probe_work *work;
	struct neighbour *neigh;
	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
		return;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		/* Rate-limit: only probe when the neighbour has stayed
		 * invalid longer than rtr_probe_interval.
		 */
		work = NULL;
		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated +
			       rt->rt6i_idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		/* No neighbour entry yet: probe unconditionally. */
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = rt->rt6i_gateway;
		dev_hold(rt->dst.dev);	/* released in rt6_probe_deferred() */
		work->dev = rt->dst.dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
/* Router probing is a no-op without CONFIG_IPV6_ROUTER_PREF. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
611
612 /*
613  * Default Router Selection (RFC 2461 6.3.6)
614  */
615 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
616 {
617         struct net_device *dev = rt->dst.dev;
618         if (!oif || dev->ifindex == oif)
619                 return 2;
620         if ((dev->flags & IFF_LOOPBACK) &&
621             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
622                 return 1;
623         return 0;
624 }
625
/* Judge the reachability of @rt's next hop from its neighbour state;
 * returns one of the rt6_nud_state codes.
 */
static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
{
	struct neighbour *neigh;
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;

	/* Gateway-less routes need no neighbour reachability check. */
	if (rt->rt6i_flags & RTF_NONEXTHOP ||
	    !(rt->rt6i_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		/* With router preference, only a FAILED neighbour demotes
		 * the route (to be probed); anything else still succeeds.
		 */
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
656
/* Score @rt for route selection: higher is better.  Returns a negative
 * rt6_nud_state value when the route must be skipped or round-robined.
 */
static int rt6_score_route(struct rt6_info *rt, int oif,
			   int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	/* Fold the RA router-preference bits in above the device score. */
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}
675
/* Compare @rt against the best candidate so far (@match at score *mpri)
 * and return the better of the two, updating *mpri and *do_rr.
 */
static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
				   int *mpri, struct rt6_info *match,
				   bool *do_rr)
{
	int m;
	bool match_do_rr = false;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *dev = rt->dst.dev;

	/* Optionally skip routes whose device lost carrier. */
	if (dev && !netif_carrier_ok(dev) &&
	    idev->cnf.ignore_routes_with_linkdown &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (rt6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}
713
/* Find the best-scoring route among the routes of @fn that share
 * @metric, starting the round-robin scan at @rr_head.  Routes with a
 * different metric (recorded in @cont) are only tried when nothing with
 * @metric matched.
 */
static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
				     struct rt6_info *rr_head,
				     u32 metric, int oif, int strict,
				     bool *do_rr)
{
	struct rt6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	/* First pass: from the round-robin head to the end of the list. */
	for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	/* Second pass: from the list head up to the round-robin head. */
	for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	/* Fallback: consider the remaining (different-metric) routes. */
	for (rt = cont; rt; rt = rt->dst.rt6_next)
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
750
/* Select the best route from fib node @fn, advancing the node's
 * round-robin pointer when the scorer requested it.  Falls back to the
 * null entry when nothing matched.
 */
static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
{
	struct rt6_info *match, *rt0;
	struct net *net;
	bool do_rr = false;

	rt0 = fn->rr_ptr;
	if (!rt0)
		fn->rr_ptr = rt0 = fn->leaf;

	match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct rt6_info *next = rt0->dst.rt6_next;

		/* no entries matched; do round-robin */
		if (!next || next->rt6i_metric != rt0->rt6i_metric)
			next = fn->leaf;

		if (next != rt0)
			fn->rr_ptr = next;
	}

	net = dev_net(rt0->dst.dev);
	return match ? match : net->ipv6.ip6_null_entry;
}
778
779 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
780 {
781         return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
782 }
783
784 #ifdef CONFIG_IPV6_ROUTE_INFO
785 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
786                   const struct in6_addr *gwaddr)
787 {
788         struct net *net = dev_net(dev);
789         struct route_info *rinfo = (struct route_info *) opt;
790         struct in6_addr prefix_buf, *prefix;
791         unsigned int pref;
792         unsigned long lifetime;
793         struct rt6_info *rt;
794
795         if (len < sizeof(struct route_info)) {
796                 return -EINVAL;
797         }
798
799         /* Sanity check for prefix_len and length */
800         if (rinfo->length > 3) {
801                 return -EINVAL;
802         } else if (rinfo->prefix_len > 128) {
803                 return -EINVAL;
804         } else if (rinfo->prefix_len > 64) {
805                 if (rinfo->length < 2) {
806                         return -EINVAL;
807                 }
808         } else if (rinfo->prefix_len > 0) {
809                 if (rinfo->length < 1) {
810                         return -EINVAL;
811                 }
812         }
813
814         pref = rinfo->route_pref;
815         if (pref == ICMPV6_ROUTER_PREF_INVALID)
816                 return -EINVAL;
817
818         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
819
820         if (rinfo->length == 3)
821                 prefix = (struct in6_addr *)rinfo->prefix;
822         else {
823                 /* this function is safe */
824                 ipv6_addr_prefix(&prefix_buf,
825                                  (struct in6_addr *)rinfo->prefix,
826                                  rinfo->prefix_len);
827                 prefix = &prefix_buf;
828         }
829
830         if (rinfo->prefix_len == 0)
831                 rt = rt6_get_dflt_router(gwaddr, dev);
832         else
833                 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
834                                         gwaddr, dev);
835
836         if (rt && !lifetime) {
837                 ip6_del_rt(rt);
838                 rt = NULL;
839         }
840
841         if (!rt && lifetime)
842                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
843                                         dev, pref);
844         else if (rt)
845                 rt->rt6i_flags = RTF_ROUTEINFO |
846                                  (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
847
848         if (rt) {
849                 if (!addrconf_finite_timeout(lifetime))
850                         rt6_clean_expires(rt);
851                 else
852                         rt6_set_expires(rt, jiffies + HZ * lifetime);
853
854                 ip6_rt_put(rt);
855         }
856         return 0;
857 }
858 #endif
859
860 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
861                                         struct in6_addr *saddr)
862 {
863         struct fib6_node *pn;
864         while (1) {
865                 if (fn->fn_flags & RTN_TL_ROOT)
866                         return NULL;
867                 pn = fn->parent;
868                 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
869                         fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
870                 else
871                         fn = pn;
872                 if (fn->fn_flags & RTN_RTINFO)
873                         return fn;
874         }
875 }
876
/* Table-scoped route lookup used via fib6_rule_lookup().
 * Walks the fib6 tree under tb6_lock, matching on device and,
 * for multipath routes with no requested oif, selecting one
 * sibling by flow hash.  Returns a dst with a reference held
 * (possibly ip6_null_entry).
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6, int flags)
{
	struct fib6_node *fn;
	struct rt6_info *rt;

	read_lock_bh(&table->tb6_lock);
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	rt = fn->leaf;
	rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
	/* Multipath selection only applies when no oif was requested */
	if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
		rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
	if (rt == net->ipv6.ip6_null_entry) {
		/* No usable route at this node; retry from the parent */
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}
	/* Take the reference and bump the usage stamp before dropping
	 * the table lock.
	 */
	dst_use(&rt->dst, jiffies);
	read_unlock_bh(&table->tb6_lock);

	trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);

	return rt;

}
904
905 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
906                                     int flags)
907 {
908         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
909 }
910 EXPORT_SYMBOL_GPL(ip6_route_lookup);
911
912 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
913                             const struct in6_addr *saddr, int oif, int strict)
914 {
915         struct flowi6 fl6 = {
916                 .flowi6_oif = oif,
917                 .daddr = *daddr,
918         };
919         struct dst_entry *dst;
920         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
921
922         if (saddr) {
923                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
924                 flags |= RT6_LOOKUP_F_HAS_SADDR;
925         }
926
927         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
928         if (dst->error == 0)
929                 return (struct rt6_info *) dst;
930
931         dst_release(dst);
932
933         return NULL;
934 }
935 EXPORT_SYMBOL(rt6_lookup);
936
937 /* ip6_ins_rt is called with FREE table->tb6_lock.
938  * It takes new route entry, the addition fails by any reason the
939  * route is released.
940  * Caller must hold dst before calling it.
941  */
942
943 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
944                         struct mx6_config *mxc,
945                         struct netlink_ext_ack *extack)
946 {
947         int err;
948         struct fib6_table *table;
949
950         table = rt->rt6i_table;
951         write_lock_bh(&table->tb6_lock);
952         err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
953         write_unlock_bh(&table->tb6_lock);
954
955         return err;
956 }
957
958 int ip6_ins_rt(struct rt6_info *rt)
959 {
960         struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
961         struct mx6_config mxc = { .mx = NULL, };
962
963         /* Hold dst to account for the reference from the fib6 tree */
964         dst_hold(&rt->dst);
965         return __ip6_ins_rt(rt, &info, &mxc, NULL);
966 }
967
968 /* called with rcu_lock held */
969 static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
970 {
971         struct net_device *dev = rt->dst.dev;
972
973         if (rt->rt6i_flags & RTF_LOCAL) {
974                 /* for copies of local routes, dst->dev needs to be the
975                  * device if it is a master device, the master device if
976                  * device is enslaved, and the loopback as the default
977                  */
978                 if (netif_is_l3_slave(dev) &&
979                     !rt6_need_strict(&rt->rt6i_dst.addr))
980                         dev = l3mdev_master_dev_rcu(dev);
981                 else if (!netif_is_l3_master(dev))
982                         dev = dev_net(dev)->loopback_dev;
983                 /* last case is netif_is_l3_master(dev) is true in which
984                  * case we want dev returned to be dev
985                  */
986         }
987
988         return dev;
989 }
990
/* Allocate an RTF_CACHE clone of @ort for destination @daddr (and
 * @saddr under CONFIG_IPV6_SUBTREES).  The clone is a /128 host
 * route not owned by the fib6 tree.  Returns NULL on allocation
 * failure.
 */
static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	/* Cache/pcpu copies point back at their parent via dst.from;
	 * clone the parent route instead of the copy.
	 */
	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = (struct rt6_info *)ort->dst.from;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(ort);
	rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
	rcu_read_unlock();
	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->rt6i_metric = 0;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		/* A destination equal to a non-/128 route's own prefix
		 * address is treated as anycast.
		 */
		if (ort->rt6i_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
1033
/* Allocate a per-cpu copy of @rt, marked RTF_PCPU.
 * Returns NULL on allocation failure.
 */
static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
{
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
	rcu_read_unlock();
	if (!pcpu_rt)
		return NULL;
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}
1050
1051 /* It should be called with read_lock_bh(&tb6_lock) acquired */
1052 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1053 {
1054         struct rt6_info *pcpu_rt, **p;
1055
1056         p = this_cpu_ptr(rt->rt6i_pcpu);
1057         pcpu_rt = *p;
1058
1059         if (pcpu_rt) {
1060                 dst_hold(&pcpu_rt->dst);
1061                 rt6_dst_from_metrics_check(pcpu_rt);
1062         }
1063         return pcpu_rt;
1064 }
1065
/* Create and publish the per-cpu copy of @rt for this cpu.
 * Falls back to ip6_null_entry on allocation failure and to @rt
 * itself when @rt was already unlinked from the fib6 tree.
 * Always returns a dst with a reference held for the caller.
 */
static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
{
	struct fib6_table *table = rt->rt6i_table;
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		struct net *net = dev_net(rt->dst.dev);

		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	read_lock_bh(&table->tb6_lock);
	if (rt->rt6i_pcpu) {
		p = this_cpu_ptr(rt->rt6i_pcpu);
		/* Publish atomically; only the first writer wins */
		prev = cmpxchg(p, NULL, pcpu_rt);
		if (prev) {
			/* If someone did it before us, return prev instead */
			dst_release_immediate(&pcpu_rt->dst);
			pcpu_rt = prev;
		}
	} else {
		/* rt has been removed from the fib6 tree
		 * before we have a chance to acquire the read_lock.
		 * In this case, don't bother to create a pcpu rt
		 * since rt is going away anyway.  The next
		 * dst_check() will trigger a re-lookup.
		 */
		dst_release_immediate(&pcpu_rt->dst);
		pcpu_rt = rt;
	}
	dst_hold(&pcpu_rt->dst);
	rt6_dst_from_metrics_check(pcpu_rt);
	read_unlock_bh(&table->tb6_lock);
	return pcpu_rt;
}
1103
/* Exception (cached-route) hash table implementation.
 * Updates are serialized by rt6_exception_lock; lookups on the
 * fast path walk the chains under RCU.
 */
1106 static DEFINE_SPINLOCK(rt6_exception_lock);
1107
/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	if (!bucket || !rt6_ex)
		return;
	/* Detach the cached route from its fib6 node, unlink the entry
	 * from the chain, then drop the table's reference.  The entry
	 * itself is freed only after an RCU grace period because
	 * readers may still be walking the chain.
	 */
	rt6_ex->rt6i->rt6i_node = NULL;
	hlist_del_rcu(&rt6_ex->hlist);
	rt6_release(rt6_ex->rt6i);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
}
1123
1124 /* Remove oldest rt6_ex in bucket and free the memory
1125  * Caller must hold rt6_exception_lock
1126  */
1127 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1128 {
1129         struct rt6_exception *rt6_ex, *oldest = NULL;
1130
1131         if (!bucket)
1132                 return;
1133
1134         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1135                 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1136                         oldest = rt6_ex;
1137         }
1138         rt6_remove_exception(bucket, oldest);
1139 }
1140
1141 static u32 rt6_exception_hash(const struct in6_addr *dst,
1142                               const struct in6_addr *src)
1143 {
1144         static u32 seed __read_mostly;
1145         u32 val;
1146
1147         net_get_random_once(&seed, sizeof(seed));
1148         val = jhash(dst, sizeof(*dst), seed);
1149
1150 #ifdef CONFIG_IPV6_SUBTREES
1151         if (src)
1152                 val = jhash(src, sizeof(*src), val);
1153 #endif
1154         return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1155 }
1156
1157 /* Helper function to find the cached rt in the hash table
1158  * and update bucket pointer to point to the bucket for this
1159  * (daddr, saddr) pair
1160  * Caller must hold rt6_exception_lock
1161  */
1162 static struct rt6_exception *
1163 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1164                               const struct in6_addr *daddr,
1165                               const struct in6_addr *saddr)
1166 {
1167         struct rt6_exception *rt6_ex;
1168         u32 hval;
1169
1170         if (!(*bucket) || !daddr)
1171                 return NULL;
1172
1173         hval = rt6_exception_hash(daddr, saddr);
1174         *bucket += hval;
1175
1176         hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1177                 struct rt6_info *rt6 = rt6_ex->rt6i;
1178                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1179
1180 #ifdef CONFIG_IPV6_SUBTREES
1181                 if (matched && saddr)
1182                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1183 #endif
1184                 if (matched)
1185                         return rt6_ex;
1186         }
1187         return NULL;
1188 }
1189
1190 /* Helper function to find the cached rt in the hash table
1191  * and update bucket pointer to point to the bucket for this
1192  * (daddr, saddr) pair
1193  * Caller must hold rcu_read_lock()
1194  */
1195 static struct rt6_exception *
1196 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1197                          const struct in6_addr *daddr,
1198                          const struct in6_addr *saddr)
1199 {
1200         struct rt6_exception *rt6_ex;
1201         u32 hval;
1202
1203         WARN_ON_ONCE(!rcu_read_lock_held());
1204
1205         if (!(*bucket) || !daddr)
1206                 return NULL;
1207
1208         hval = rt6_exception_hash(daddr, saddr);
1209         *bucket += hval;
1210
1211         hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1212                 struct rt6_info *rt6 = rt6_ex->rt6i;
1213                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1214
1215 #ifdef CONFIG_IPV6_SUBTREES
1216                 if (matched && saddr)
1217                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1218 #endif
1219                 if (matched)
1220                         return rt6_ex;
1221         }
1222         return NULL;
1223 }
1224
/* Insert the RTF_CACHE clone @nrt into the exception table of @ort.
 * An existing entry for the same (daddr, saddr) key is replaced, and
 * the bucket is capped at FIB6_MAX_DEPTH entries by evicting the
 * oldest one.  On success the fib6 node's sernum is bumped so stale
 * cached dsts are invalidated.  Returns 0 or a negative errno.
 */
static int rt6_insert_exception(struct rt6_info *nrt,
				struct rt6_info *ort)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	/* ort can't be a cache or pcpu route */
	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = (struct rt6_info *)ort->dst.from;
	WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));

	spin_lock_bh(&rt6_exception_lock);

	/* The table is being torn down; refuse to recreate it */
	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		/* First exception for this route: allocate the buckets */
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->rt6i_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif
	/* Replace any existing entry for the same key */
	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	atomic_inc(&nrt->rt6i_ref);
	nrt->rt6i_node = ort->rt6i_node;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err)
		fib6_update_sernum(ort);

	return err;
}
1296
1297 void rt6_flush_exceptions(struct rt6_info *rt)
1298 {
1299         struct rt6_exception_bucket *bucket;
1300         struct rt6_exception *rt6_ex;
1301         struct hlist_node *tmp;
1302         int i;
1303
1304         spin_lock_bh(&rt6_exception_lock);
1305         /* Prevent rt6_insert_exception() to recreate the bucket list */
1306         rt->exception_bucket_flushed = 1;
1307
1308         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1309                                     lockdep_is_held(&rt6_exception_lock));
1310         if (!bucket)
1311                 goto out;
1312
1313         for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1314                 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1315                         rt6_remove_exception(bucket, rt6_ex);
1316                 WARN_ON_ONCE(bucket->depth);
1317                 bucket++;
1318         }
1319
1320 out:
1321         spin_unlock_bh(&rt6_exception_lock);
1322 }
1323
1324 /* Find cached rt in the hash table inside passed in rt
1325  * Caller has to hold rcu_read_lock()
1326  */
1327 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
1328                                            struct in6_addr *daddr,
1329                                            struct in6_addr *saddr)
1330 {
1331         struct rt6_exception_bucket *bucket;
1332         struct in6_addr *src_key = NULL;
1333         struct rt6_exception *rt6_ex;
1334         struct rt6_info *res = NULL;
1335
1336         bucket = rcu_dereference(rt->rt6i_exception_bucket);
1337
1338 #ifdef CONFIG_IPV6_SUBTREES
1339         /* rt6i_src.plen != 0 indicates rt is in subtree
1340          * and exception table is indexed by a hash of
1341          * both rt6i_dst and rt6i_src.
1342          * Otherwise, the exception table is indexed by
1343          * a hash of only rt6i_dst.
1344          */
1345         if (rt->rt6i_src.plen)
1346                 src_key = saddr;
1347 #endif
1348         rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1349
1350         if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1351                 res = rt6_ex->rt6i;
1352
1353         return res;
1354 }
1355
1356 /* Remove the passed in cached rt from the hash table that contains it */
1357 int rt6_remove_exception_rt(struct rt6_info *rt)
1358 {
1359         struct rt6_info *from = (struct rt6_info *)rt->dst.from;
1360         struct rt6_exception_bucket *bucket;
1361         struct in6_addr *src_key = NULL;
1362         struct rt6_exception *rt6_ex;
1363         int err;
1364
1365         if (!from ||
1366             !(rt->rt6i_flags | RTF_CACHE))
1367                 return -EINVAL;
1368
1369         if (!rcu_access_pointer(from->rt6i_exception_bucket))
1370                 return -ENOENT;
1371
1372         spin_lock_bh(&rt6_exception_lock);
1373         bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1374                                     lockdep_is_held(&rt6_exception_lock));
1375 #ifdef CONFIG_IPV6_SUBTREES
1376         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1377          * and exception table is indexed by a hash of
1378          * both rt6i_dst and rt6i_src.
1379          * Otherwise, the exception table is indexed by
1380          * a hash of only rt6i_dst.
1381          */
1382         if (from->rt6i_src.plen)
1383                 src_key = &rt->rt6i_src.addr;
1384 #endif
1385         rt6_ex = __rt6_find_exception_spinlock(&bucket,
1386                                                &rt->rt6i_dst.addr,
1387                                                src_key);
1388         if (rt6_ex) {
1389                 rt6_remove_exception(bucket, rt6_ex);
1390                 err = 0;
1391         } else {
1392                 err = -ENOENT;
1393         }
1394
1395         spin_unlock_bh(&rt6_exception_lock);
1396         return err;
1397 }
1398
1399 /* Find rt6_ex which contains the passed in rt cache and
1400  * refresh its stamp
1401  */
1402 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1403 {
1404         struct rt6_info *from = (struct rt6_info *)rt->dst.from;
1405         struct rt6_exception_bucket *bucket;
1406         struct in6_addr *src_key = NULL;
1407         struct rt6_exception *rt6_ex;
1408
1409         if (!from ||
1410             !(rt->rt6i_flags | RTF_CACHE))
1411                 return;
1412
1413         rcu_read_lock();
1414         bucket = rcu_dereference(from->rt6i_exception_bucket);
1415
1416 #ifdef CONFIG_IPV6_SUBTREES
1417         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1418          * and exception table is indexed by a hash of
1419          * both rt6i_dst and rt6i_src.
1420          * Otherwise, the exception table is indexed by
1421          * a hash of only rt6i_dst.
1422          */
1423         if (from->rt6i_src.plen)
1424                 src_key = &rt->rt6i_src.addr;
1425 #endif
1426         rt6_ex = __rt6_find_exception_rcu(&bucket,
1427                                           &rt->rt6i_dst.addr,
1428                                           src_key);
1429         if (rt6_ex)
1430                 rt6_ex->stamp = jiffies;
1431
1432         rcu_read_unlock();
1433 }
1434
/* Core policy routing lookup.  Returns, always with a reference held:
 *   - an RTF_CACHE entry or ip6_null_entry straight from the tree,
 *   - a fresh uncached RTF_CACHE clone for the FLOWI_FLAG_KNOWN_NH
 *     non-gateway case, or
 *   - a per-cpu copy of the matched route otherwise.
 */
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6, int flags)
{
	struct fib6_node *fn, *saved_fn;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	/* With forwarding off, prefer (probably) reachable routers */
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	read_lock_bh(&table->tb6_lock);

	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	rt = rt6_select(fn, oif, strict);
	if (rt->rt6i_nsiblings)
		rt = rt6_multipath_select(rt, fl6, oif, strict);
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}


	if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
		/* Null entry or already-cached clone: return as-is */
		dst_use(&rt->dst, jiffies);
		read_unlock_bh(&table->tb6_lock);

		rt6_dst_from_metrics_check(rt);

		trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(rt->rt6i_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */

		struct rt6_info *uncached_rt;

		dst_use(&rt->dst, jiffies);
		read_unlock_bh(&table->tb6_lock);

		uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
		dst_release(&rt->dst);

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
		return uncached_rt;

	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		rt->dst.lastuse = jiffies;
		rt->dst.__use++;
		pcpu_rt = rt6_get_pcpu_route(rt);

		if (pcpu_rt) {
			read_unlock_bh(&table->tb6_lock);
		} else {
			/* We have to do the read_unlock first
			 * because rt6_make_pcpu_route() may trigger
			 * ip6_dst_gc() which will take the write_lock.
			 */
			dst_hold(&rt->dst);
			read_unlock_bh(&table->tb6_lock);
			pcpu_rt = rt6_make_pcpu_route(rt);
			dst_release(&rt->dst);
		}

		trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
		return pcpu_rt;

	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
1537
1538 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1539                                             struct flowi6 *fl6, int flags)
1540 {
1541         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1542 }
1543
1544 struct dst_entry *ip6_route_input_lookup(struct net *net,
1545                                          struct net_device *dev,
1546                                          struct flowi6 *fl6, int flags)
1547 {
1548         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1549                 flags |= RT6_LOOKUP_F_IFACE;
1550
1551         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1552 }
1553 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1554
1555 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1556                                   struct flow_keys *keys)
1557 {
1558         const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1559         const struct ipv6hdr *key_iph = outer_iph;
1560         const struct ipv6hdr *inner_iph;
1561         const struct icmp6hdr *icmph;
1562         struct ipv6hdr _inner_iph;
1563
1564         if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1565                 goto out;
1566
1567         icmph = icmp6_hdr(skb);
1568         if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1569             icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1570             icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1571             icmph->icmp6_type != ICMPV6_PARAMPROB)
1572                 goto out;
1573
1574         inner_iph = skb_header_pointer(skb,
1575                                        skb_transport_offset(skb) + sizeof(*icmph),
1576                                        sizeof(_inner_iph), &_inner_iph);
1577         if (!inner_iph)
1578                 goto out;
1579
1580         key_iph = inner_iph;
1581 out:
1582         memset(keys, 0, sizeof(*keys));
1583         keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1584         keys->addrs.v6addrs.src = key_iph->saddr;
1585         keys->addrs.v6addrs.dst = key_iph->daddr;
1586         keys->tags.flow_label = ip6_flowinfo(key_iph);
1587         keys->basic.ip_proto = key_iph->nexthdr;
1588 }
1589
1590 /* if skb is set it will be used and fl6 can be NULL */
1591 u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
1592 {
1593         struct flow_keys hash_keys;
1594
1595         if (skb) {
1596                 ip6_multipath_l3_keys(skb, &hash_keys);
1597                 return flow_hash_from_keys(&hash_keys);
1598         }
1599
1600         return get_hash_from_flowi6(fl6);
1601 }
1602
/* Route a received skb: build a flowi6 from its IPv6 header and
 * attach the looked-up dst to the skb.
 */
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};

	/* Propagate the tunnel id for RX-side tunnel metadata */
	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
	/* Precompute the multipath hash for ICMPv6 so errors follow the
	 * flow they refer to (see ip6_multipath_l3_keys()).
	 */
	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
	skb_dst_drop(skb);
	skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
}
1626
1627 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1628                                              struct flowi6 *fl6, int flags)
1629 {
1630         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1631 }
1632
1633 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1634                                          struct flowi6 *fl6, int flags)
1635 {
1636         bool any_src;
1637
1638         if (rt6_need_strict(&fl6->daddr)) {
1639                 struct dst_entry *dst;
1640
1641                 dst = l3mdev_link_scope_lookup(net, fl6);
1642                 if (dst)
1643                         return dst;
1644         }
1645
1646         fl6->flowi6_iif = LOOPBACK_IFINDEX;
1647
1648         any_src = ipv6_addr_any(&fl6->saddr);
1649         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1650             (fl6->flowi6_oif && any_src))
1651                 flags |= RT6_LOOKUP_F_IFACE;
1652
1653         if (!any_src)
1654                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1655         else if (sk)
1656                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1657
1658         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1659 }
1660 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1661
/* Replace @dst_orig with a blackhole dst that discards all traffic
 * while copying the original route's metrics, gateway, flags and
 * addresses.  Consumes the caller's reference on @dst_orig.
 * Returns ERR_PTR(-ENOMEM) on allocation failure.
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_NONE, 0);
	if (rt) {
		rt6_info_init(rt);

		new = &rt->dst;
		new->__use = 1;
		/* Silently drop anything routed through this entry */
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		/* Not a per-cpu copy, whatever the original was */
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
		rt->rt6i_metric = 0;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}
1694
1695 /*
1696  *      Destination cache support functions
1697  */
1698
1699 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1700 {
1701         if (rt->dst.from &&
1702             dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1703                 dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
1704 }
1705
1706 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1707 {
1708         u32 rt_cookie = 0;
1709
1710         if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
1711                 return NULL;
1712
1713         if (rt6_check_expired(rt))
1714                 return NULL;
1715
1716         return &rt->dst;
1717 }
1718
1719 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1720 {
1721         if (!__rt6_check_expired(rt) &&
1722             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1723             rt6_check((struct rt6_info *)(rt->dst.from), cookie))
1724                 return &rt->dst;
1725         else
1726                 return NULL;
1727 }
1728
/* dst_ops->check entry point: revalidate @dst against @cookie.
 * Returns @dst when still usable, NULL to force a fresh route lookup.
 */
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rt6_info *rt;

	rt = (struct rt6_info *) dst;

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	rt6_dst_from_metrics_check(rt);

	/* Per-cpu copies, and uncached clones that still reference a
	 * parent route via dst.from, are validated through that parent;
	 * every other route is checked directly.
	 */
	if (rt->rt6i_flags & RTF_PCPU ||
	    (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from))
		return rt6_dst_from_check(rt, cookie);
	else
		return rt6_check(rt, cookie);
}
1748
1749 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1750 {
1751         struct rt6_info *rt = (struct rt6_info *) dst;
1752
1753         if (rt) {
1754                 if (rt->rt6i_flags & RTF_CACHE) {
1755                         if (rt6_check_expired(rt)) {
1756                                 ip6_del_rt(rt);
1757                                 dst = NULL;
1758                         }
1759                 } else {
1760                         dst_release(dst);
1761                         dst = NULL;
1762                 }
1763         }
1764         return dst;
1765 }
1766
/* dst_ops->link_failure: the neighbour for this route is unreachable.
 * Report it to the sender and invalidate the offending route.
 */
static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			/* Delete the cached clone, but only if we can take
			 * a reference safely (it may already be dying).
			 */
			if (dst_hold_safe(&rt->dst))
				ip6_del_rt(rt);
		} else {
			struct fib6_node *fn;

			/* Poison the fib node's serial number so cached
			 * default-route dsts fail their next check.
			 */
			rcu_read_lock();
			fn = rcu_dereference(rt->rt6i_node);
			if (fn && (rt->rt6i_flags & RTF_DEFAULT))
				fn->fn_sernum = -1;
			rcu_read_unlock();
		}
	}
}
1789
1790 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
1791 {
1792         struct net *net = dev_net(rt->dst.dev);
1793
1794         rt->rt6i_flags |= RTF_MODIFIED;
1795         rt->rt6i_pmtu = mtu;
1796         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1797 }
1798
1799 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
1800 {
1801         return !(rt->rt6i_flags & RTF_CACHE) &&
1802                 (rt->rt6i_flags & RTF_PCPU ||
1803                  rcu_access_pointer(rt->rt6i_node));
1804 }
1805
/* Core PMTU handler: record @mtu for the flow identified by @iph, or by
 * @sk's addresses when no header is available.  Updates @dst in place
 * when allowed, otherwise inserts a per-destination cached clone.
 */
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	if (rt6->rt6i_flags & RTF_LOCAL)
		return;

	/* An administratively locked MTU must never be overridden. */
	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	/* Prefer the packet header's addresses; fall back to the socket. */
	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}
	dst_confirm_neigh(dst, daddr);
	/* Never shrink below the IPv6 minimum MTU; ignore non-decreases. */
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
	} else if (daddr) {
		struct rt6_info *nrt6;

		nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);

			/* ip6_ins_rt(nrt6) will bump the
			 * rt6->rt6i_node->fn_sernum
			 * which will fail the next rt6_check() and
			 * invalidate the sk->sk_dst_cache.
			 */
			ip6_ins_rt(nrt6);
			/* Release the reference taken in
			 * ip6_rt_cache_alloc()
			 */
			dst_release(&nrt6->dst);
		}
	}
}
1855
1856 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1857                                struct sk_buff *skb, u32 mtu)
1858 {
1859         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
1860 }
1861
1862 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1863                      int oif, u32 mark, kuid_t uid)
1864 {
1865         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1866         struct dst_entry *dst;
1867         struct flowi6 fl6;
1868
1869         memset(&fl6, 0, sizeof(fl6));
1870         fl6.flowi6_oif = oif;
1871         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
1872         fl6.daddr = iph->daddr;
1873         fl6.saddr = iph->saddr;
1874         fl6.flowlabel = ip6_flowinfo(iph);
1875         fl6.flowi6_uid = uid;
1876
1877         dst = ip6_route_output(net, NULL, &fl6);
1878         if (!dst->error)
1879                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
1880         dst_release(dst);
1881 }
1882 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
1883
/* Socket-aware PMTU update: apply the update using the socket's bound
 * device, mark and uid, then refresh the socket's cached dst if the
 * update invalidated it.
 */
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	struct dst_entry *dst;

	ip6_update_pmtu(skb, sock_net(sk), mtu,
			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);

	/* Nothing more to do if there is no cached dst, or if it still
	 * validates against the socket's cookie.
	 */
	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	/* Only refresh when the socket is not owned by user context;
	 * v4-mapped destinations are skipped here.
	 */
	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1902
/* Handle redirects */

/* Flow key used while processing an ICMPv6 redirect: the ordinary
 * flowi6 plus the address of the router that sent the redirect, so the
 * lookup can match it against rt6i_gateway.
 */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;
};
1908
1909 static struct rt6_info *__ip6_route_redirect(struct net *net,
1910                                              struct fib6_table *table,
1911                                              struct flowi6 *fl6,
1912                                              int flags)
1913 {
1914         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1915         struct rt6_info *rt;
1916         struct fib6_node *fn;
1917
1918         /* Get the "current" route for this destination and
1919          * check if the redirect has come from appropriate router.
1920          *
1921          * RFC 4861 specifies that redirects should only be
1922          * accepted if they come from the nexthop to the target.
1923          * Due to the way the routes are chosen, this notion
1924          * is a bit fuzzy and one might need to check all possible
1925          * routes.
1926          */
1927
1928         read_lock_bh(&table->tb6_lock);
1929         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1930 restart:
1931         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1932                 if (rt6_check_expired(rt))
1933                         continue;
1934                 if (rt->dst.error)
1935                         break;
1936                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1937                         continue;
1938                 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1939                         continue;
1940                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1941                         continue;
1942                 break;
1943         }
1944
1945         if (!rt)
1946                 rt = net->ipv6.ip6_null_entry;
1947         else if (rt->dst.error) {
1948                 rt = net->ipv6.ip6_null_entry;
1949                 goto out;
1950         }
1951
1952         if (rt == net->ipv6.ip6_null_entry) {
1953                 fn = fib6_backtrack(fn, &fl6->saddr);
1954                 if (fn)
1955                         goto restart;
1956         }
1957
1958 out:
1959         dst_hold(&rt->dst);
1960
1961         read_unlock_bh(&table->tb6_lock);
1962
1963         trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1964         return rt;
1965 };
1966
1967 static struct dst_entry *ip6_route_redirect(struct net *net,
1968                                         const struct flowi6 *fl6,
1969                                         const struct in6_addr *gateway)
1970 {
1971         int flags = RT6_LOOKUP_F_HAS_SADDR;
1972         struct ip6rd_flowi rdfl;
1973
1974         rdfl.fl6 = *fl6;
1975         rdfl.gateway = *gateway;
1976
1977         return fib6_rule_lookup(net, &rdfl.fl6,
1978                                 flags, __ip6_route_redirect);
1979 }
1980
1981 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
1982                   kuid_t uid)
1983 {
1984         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1985         struct dst_entry *dst;
1986         struct flowi6 fl6;
1987
1988         memset(&fl6, 0, sizeof(fl6));
1989         fl6.flowi6_iif = LOOPBACK_IFINDEX;
1990         fl6.flowi6_oif = oif;
1991         fl6.flowi6_mark = mark;
1992         fl6.daddr = iph->daddr;
1993         fl6.saddr = iph->saddr;
1994         fl6.flowlabel = ip6_flowinfo(iph);
1995         fl6.flowi6_uid = uid;
1996
1997         dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
1998         rt6_do_redirect(dst, NULL, skb);
1999         dst_release(dst);
2000 }
2001 EXPORT_SYMBOL_GPL(ip6_redirect);
2002
2003 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2004                             u32 mark)
2005 {
2006         const struct ipv6hdr *iph = ipv6_hdr(skb);
2007         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2008         struct dst_entry *dst;
2009         struct flowi6 fl6;
2010
2011         memset(&fl6, 0, sizeof(fl6));
2012         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2013         fl6.flowi6_oif = oif;
2014         fl6.flowi6_mark = mark;
2015         fl6.daddr = msg->dest;
2016         fl6.saddr = iph->daddr;
2017         fl6.flowi6_uid = sock_net_uid(net, NULL);
2018
2019         dst = ip6_route_redirect(net, &fl6, &iph->saddr);
2020         rt6_do_redirect(dst, NULL, skb);
2021         dst_release(dst);
2022 }
2023
/* Socket-scoped redirect handler: run ip6_redirect() with the socket's
 * bound device, mark and uid.
 */
void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
		     sk->sk_uid);
}
EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2030
2031 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2032 {
2033         struct net_device *dev = dst->dev;
2034         unsigned int mtu = dst_mtu(dst);
2035         struct net *net = dev_net(dev);
2036
2037         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2038
2039         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2040                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2041
2042         /*
2043          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2044          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2045          * IPV6_MAXPLEN is also valid and means: "any MSS,
2046          * rely only on pmtu discovery"
2047          */
2048         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2049                 mtu = IPV6_MAXPLEN;
2050         return mtu;
2051 }
2052
2053 static unsigned int ip6_mtu(const struct dst_entry *dst)
2054 {
2055         const struct rt6_info *rt = (const struct rt6_info *)dst;
2056         unsigned int mtu = rt->rt6i_pmtu;
2057         struct inet6_dev *idev;
2058
2059         if (mtu)
2060                 goto out;
2061
2062         mtu = dst_metric_raw(dst, RTAX_MTU);
2063         if (mtu)
2064                 goto out;
2065
2066         mtu = IPV6_MIN_MTU;
2067
2068         rcu_read_lock();
2069         idev = __in6_dev_get(dst->dev);
2070         if (idev)
2071                 mtu = idev->cnf.mtu6;
2072         rcu_read_unlock();
2073
2074 out:
2075         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2076
2077         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2078 }
2079
/* Allocate a host route for sending an ICMPv6 packet to fl6->daddr via
 * @dev.  The route is not inserted into the fib tree; it is kept on the
 * uncached list instead.  Returns the dst (possibly transformed by
 * xfrm_lookup) or an ERR_PTR on failure.
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		/* Drop the idev reference taken above on failure. */
		in6_dev_put(idev);
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	/* Host route straight to the destination; the idev reference is
	 * handed over to the route.
	 */
	rt->dst.flags |= DST_HOST;
	rt->dst.output  = ip6_output;
	rt->rt6i_gateway  = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev     = idev;
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Add this dst into uncached_list so that rt6_ifdown() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}
2116
/* dst_ops->gc: garbage-collect IPv6 dst entries.  Returns nonzero when
 * the entry count is still above ip6_rt_max_size after collection.
 */
static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	/* Rate-limit: skip the walk if one ran recently and we are not
	 * over the hard size limit.
	 */
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	/* ip6_rt_gc_expire grows with GC pressure and is halved once a
	 * pass brings us back under gc_thresh.
	 */
	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
	/* Exponentially decay the aggressiveness of future passes. */
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
	return entries > rt_max_size;
}
2141
2142 static int ip6_convert_metrics(struct mx6_config *mxc,
2143                                const struct fib6_config *cfg)
2144 {
2145         bool ecn_ca = false;
2146         struct nlattr *nla;
2147         int remaining;
2148         u32 *mp;
2149
2150         if (!cfg->fc_mx)
2151                 return 0;
2152
2153         mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
2154         if (unlikely(!mp))
2155                 return -ENOMEM;
2156
2157         nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
2158                 int type = nla_type(nla);
2159                 u32 val;
2160
2161                 if (!type)
2162                         continue;
2163                 if (unlikely(type > RTAX_MAX))
2164                         goto err;
2165
2166                 if (type == RTAX_CC_ALGO) {
2167                         char tmp[TCP_CA_NAME_MAX];
2168
2169                         nla_strlcpy(tmp, nla, sizeof(tmp));
2170                         val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
2171                         if (val == TCP_CA_UNSPEC)
2172                                 goto err;
2173                 } else {
2174                         val = nla_get_u32(nla);
2175                 }
2176                 if (type == RTAX_HOPLIMIT && val > 255)
2177                         val = 255;
2178                 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
2179                         goto err;
2180
2181                 mp[type - 1] = val;
2182                 __set_bit(type - 1, mxc->mx_valid);
2183         }
2184
2185         if (ecn_ca) {
2186                 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
2187                 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
2188         }
2189
2190         mxc->mx = mp;
2191         return 0;
2192  err:
2193         kfree(mp);
2194         return -EINVAL;
2195 }
2196
2197 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2198                                             struct fib6_config *cfg,
2199                                             const struct in6_addr *gw_addr)
2200 {
2201         struct flowi6 fl6 = {
2202                 .flowi6_oif = cfg->fc_ifindex,
2203                 .daddr = *gw_addr,
2204                 .saddr = cfg->fc_prefsrc,
2205         };
2206         struct fib6_table *table;
2207         struct rt6_info *rt;
2208         int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE;
2209
2210         table = fib6_get_table(net, cfg->fc_table);
2211         if (!table)
2212                 return NULL;
2213
2214         if (!ipv6_addr_any(&cfg->fc_prefsrc))
2215                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2216
2217         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
2218
2219         /* if table lookup failed, fall back to full lookup */
2220         if (rt == net->ipv6.ip6_null_entry) {
2221                 ip6_rt_put(rt);
2222                 rt = NULL;
2223         }
2224
2225         return rt;
2226 }
2227
/* Validate @cfg and build a fully initialized (but not yet inserted)
 * rt6_info from it.  On success the route holds references on its
 * device and inet6_dev; on failure all acquired references are dropped
 * and an ERR_PTR is returned.
 */
static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
					      struct netlink_ext_ack *extack)
{
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct rt6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	struct fib6_table *table;
	int addr_type;
	int err = -EINVAL;

	/* RTF_PCPU is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_PCPU) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
		goto out;
	}

	if (cfg->fc_dst_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		goto out;
	}
	if (cfg->fc_src_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid source address length");
		goto out;
	}
#ifndef CONFIG_IPV6_SUBTREES
	if (cfg->fc_src_len) {
		NL_SET_ERR_MSG(extack,
			       "Specifying source address requires IPV6_SUBTREES to be enabled");
		goto out;
	}
#endif
	/* Resolve the egress device named by the config; both dev and
	 * idev references are held until install or error cleanup.
	 */
	if (cfg->fc_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_metric == 0)
		cfg->fc_metric = IP6_RT_PRIO_USER;

	err = -ENOBUFS;
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
		if (!table) {
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}

	if (!table)
		goto out;

	rt = ip6_dst_alloc(net, NULL,
			   (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);

	if (!rt) {
		err = -ENOMEM;
		goto out;
	}

	if (cfg->fc_flags & RTF_EXPIRES)
		rt6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
	else
		rt6_clean_expires(rt);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->rt6i_protocol = cfg->fc_protocol;

	addr_type = ipv6_addr_type(&cfg->fc_dst);

	/* Pick the input handler from the destination's address type. */
	if (addr_type & IPV6_ADDR_MULTICAST)
		rt->dst.input = ip6_mc_input;
	else if (cfg->fc_flags & RTF_LOCAL)
		rt->dst.input = ip6_input;
	else
		rt->dst.input = ip6_forward;

	rt->dst.output = ip6_output;

	/* Optional lightweight tunnel encapsulation may wrap the
	 * input/output handlers chosen above.
	 */
	if (cfg->fc_encap) {
		struct lwtunnel_state *lwtstate;

		err = lwtunnel_build_state(cfg->fc_encap_type,
					   cfg->fc_encap, AF_INET6, cfg,
					   &lwtstate, extack);
		if (err)
			goto out;
		rt->dst.lwtstate = lwtstate_get(lwtstate);
		if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
			rt->dst.lwtstate->orig_output = rt->dst.output;
			rt->dst.output = lwtunnel_output;
		}
		if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
			rt->dst.lwtstate->orig_input = rt->dst.input;
			rt->dst.input = lwtunnel_input;
		}
	}

	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->rt6i_dst.plen = cfg->fc_dst_len;
	if (rt->rt6i_dst.plen == 128)
		rt->dst.flags |= DST_HOST;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->rt6i_src.plen = cfg->fc_src_len;
#endif

	rt->rt6i_metric = cfg->fc_metric;

	/* We cannot add true routes via loopback here,
	   they would result in kernel looping; promote them to reject routes
	 */
	if ((cfg->fc_flags & RTF_REJECT) ||
	    (dev && (dev->flags & IFF_LOOPBACK) &&
	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
	     !(cfg->fc_flags & RTF_LOCAL))) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
		/* Map the netlink route type onto error code and
		 * discard handlers.
		 */
		switch (cfg->fc_type) {
		case RTN_BLACKHOLE:
			rt->dst.error = -EINVAL;
			rt->dst.output = dst_discard_out;
			rt->dst.input = dst_discard;
			break;
		case RTN_PROHIBIT:
			rt->dst.error = -EACCES;
			rt->dst.output = ip6_pkt_prohibit_out;
			rt->dst.input = ip6_pkt_prohibit;
			break;
		case RTN_THROW:
		case RTN_UNREACHABLE:
		default:
			rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
					: (cfg->fc_type == RTN_UNREACHABLE)
					? -EHOSTUNREACH : -ENETUNREACH;
			rt->dst.output = ip6_pkt_discard_out;
			rt->dst.input = ip6_pkt_discard;
			break;
		}
		goto install_route;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		const struct in6_addr *gw_addr;
		int gwa_type;

		gw_addr = &cfg->fc_gateway;
		gwa_type = ipv6_addr_type(gw_addr);

		/* if gw_addr is local we will fail to detect this in case
		 * address is still TENTATIVE (DAD in progress). rt6_lookup()
		 * will return already-added prefix route via interface that
		 * prefix route was assigned to, which might be non-loopback.
		 */
		err = -EINVAL;
		if (ipv6_chk_addr_and_flags(net, gw_addr,
					    gwa_type & IPV6_ADDR_LINKLOCAL ?
					    dev : NULL, 0, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid gateway address");
			goto out;
		}
		rt->rt6i_gateway = *gw_addr;

		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
			struct rt6_info *grt = NULL;

			/* IPv6 strictly inhibits using not link-local
			   addresses as nexthop address.
			   Otherwise, router will not able to send redirects.
			   It is very good, but in some (rare!) circumstances
			   (SIT, PtP, NBMA NOARP links) it is handy to allow
			   some exceptions. --ANK
			   We allow IPv4-mapped nexthops to support RFC4798-type
			   addressing
			 */
			if (!(gwa_type & (IPV6_ADDR_UNICAST |
					  IPV6_ADDR_MAPPED))) {
				NL_SET_ERR_MSG(extack,
					       "Invalid gateway address");
				goto out;
			}

			/* Try the configured table first, then fall back to
			 * a full lookup for the gateway's own route.
			 */
			if (cfg->fc_table) {
				grt = ip6_nh_lookup_table(net, cfg, gw_addr);

				if (grt) {
					if (grt->rt6i_flags & RTF_GATEWAY ||
					    (dev && dev != grt->dst.dev)) {
						ip6_rt_put(grt);
						grt = NULL;
					}
				}
			}

			if (!grt)
				grt = rt6_lookup(net, gw_addr, NULL,
						 cfg->fc_ifindex, 1);

			err = -EHOSTUNREACH;
			if (!grt)
				goto out;
			if (dev) {
				if (dev != grt->dst.dev) {
					ip6_rt_put(grt);
					goto out;
				}
			} else {
				/* Adopt the gateway route's device/idev,
				 * taking our own references before grt is
				 * released.
				 */
				dev = grt->dst.dev;
				idev = grt->rt6i_idev;
				dev_hold(dev);
				in6_dev_hold(grt->rt6i_idev);
			}
			if (!(grt->rt6i_flags & RTF_GATEWAY))
				err = 0;
			ip6_rt_put(grt);

			if (err)
				goto out;
		}
		err = -EINVAL;
		if (!dev) {
			NL_SET_ERR_MSG(extack, "Egress device not specified");
			goto out;
		} else if (dev->flags & IFF_LOOPBACK) {
			NL_SET_ERR_MSG(extack,
				       "Egress device can not be loopback device for this route");
			goto out;
		}
	}

	err = -ENODEV;
	if (!dev)
		goto out;

	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid source address");
			err = -EINVAL;
			goto out;
		}
		rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
		rt->rt6i_prefsrc.plen = 128;
	} else
		rt->rt6i_prefsrc.plen = 0;

	rt->rt6i_flags = cfg->fc_flags;

install_route:
	/* dev/idev references are transferred to the route here. */
	rt->dst.dev = dev;
	rt->rt6i_idev = idev;
	rt->rt6i_table = table;

	cfg->fc_nlinfo.nl_net = dev_net(dev);

	return rt;
out:
	if (dev)
		dev_put(dev);
	if (idev)
		in6_dev_put(idev);
	if (rt)
		dst_release_immediate(&rt->dst);

	return ERR_PTR(err);
}
2517
/* Create a route from @cfg, convert any netlink metrics, and insert it
 * into the FIB.  Returns 0 on success or a negative errno.
 */
int ip6_route_add(struct fib6_config *cfg,
		  struct netlink_ext_ack *extack)
{
	struct mx6_config mxc = { .mx = NULL, };
	struct rt6_info *rt;
	int err;

	rt = ip6_route_info_create(cfg, extack);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto out;
	}

	err = ip6_convert_metrics(&mxc, cfg);
	if (err)
		goto out;

	/* NOTE(review): after __ip6_ins_rt() the rt reference appears
	 * to be consumed on both success and failure (no release here)
	 * — confirm against __ip6_ins_rt()'s contract.  Only the
	 * metrics array remains ours to free.
	 */
	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);

	kfree(mxc.mx);

	return err;
out:
	if (rt)
		dst_release_immediate(&rt->dst);

	return err;
}
2547
/* Remove @rt from its FIB table under the table write lock.
 *
 * Refuses to delete the per-namespace null entry (-ENOENT).
 * Consumes the caller's reference on @rt on every path (ip6_rt_put()).
 */
static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
{
	int err;
	struct fib6_table *table;
	struct net *net = dev_net(rt->dst.dev);

	if (rt == net->ipv6.ip6_null_entry) {
		err = -ENOENT;
		goto out;
	}

	table = rt->rt6i_table;
	write_lock_bh(&table->tb6_lock);
	err = fib6_del(rt, info);
	write_unlock_bh(&table->tb6_lock);

out:
	ip6_rt_put(rt);
	return err;
}
2568
2569 int ip6_del_rt(struct rt6_info *rt)
2570 {
2571         struct nl_info info = {
2572                 .nl_net = dev_net(rt->dst.dev),
2573         };
2574         return __ip6_del_rt(rt, &info);
2575 }
2576
/* Delete @rt and, when fc_delete_all_nh is set, all of its multipath
 * siblings under a single table write lock.
 *
 * A single RTM_DELROUTE notification covering every hop is preferred;
 * if the skb cannot be built we fall back to per-route notifications
 * (info->skip_notify stays 0).  Consumes the caller's reference on @rt.
 */
static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	int err = -ENOENT;

	if (rt == net->ipv6.ip6_null_entry)
		goto out_put;
	table = rt->rt6i_table;
	write_lock_bh(&table->tb6_lock);

	if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
		struct rt6_info *sibling, *next_sibling;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			if (rt6_fill_node(net, skb, rt,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				kfree_skb(skb);
				skb = NULL;
			} else
				info->skip_notify = 1;
		}

		/* _safe: fib6_del() unlinks the entry we are standing on */
		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->rt6i_siblings,
					 rt6i_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
		}
	}

	err = fib6_del(rt, info);
out_unlock:
	write_unlock_bh(&table->tb6_lock);
out_put:
	ip6_rt_put(rt);

	if (skb) {
		/* send the combined notification built above */
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
	return err;
}
2628
/* Delete the route described by @cfg from its table.
 *
 * Candidates are matched on cache-ness, output device, gateway, metric
 * and protocol - each criterion only when set in @cfg.  The table read
 * lock is dropped before the actual deletion (which takes the write
 * lock); a dst reference pins the chosen route across the gap.
 */
static int ip6_route_del(struct fib6_config *cfg,
			 struct netlink_ext_ack *extack)
{
	struct fib6_table *table;
	struct fib6_node *fn;
	struct rt6_info *rt;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table) {
		NL_SET_ERR_MSG(extack, "FIB table does not exist");
		return err;
	}

	read_lock_bh(&table->tb6_lock);

	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len);

	if (fn) {
		for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
			/* cached clones are only removed when the request
			 * explicitly asked for them (RTF_CACHE in fc_flags)
			 */
			if ((rt->rt6i_flags & RTF_CACHE) &&
			    !(cfg->fc_flags & RTF_CACHE))
				continue;
			if (cfg->fc_ifindex &&
			    (!rt->dst.dev ||
			     rt->dst.dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
				continue;
			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
				continue;
			if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
				continue;
			dst_hold(&rt->dst);
			read_unlock_bh(&table->tb6_lock);

			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				return __ip6_del_rt(rt, &cfg->fc_nlinfo);

			return __ip6_del_rt_siblings(rt, cfg);
		}
	}
	read_unlock_bh(&table->tb6_lock);

	return err;
}
2679
/* Process an ICMPv6 Redirect received on @skb for the path @dst
 * (RFC 4861 section 8).  On acceptance the neighbour cache is updated
 * and an RTF_CACHE clone via the new first hop is inserted; a previous
 * cached entry for the destination is deleted.
 */
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct netevent_redirect netevent;
	struct rt6_info *rt, *nrt = NULL;
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	struct rd_msg *msg;
	int optlen, on_link;
	u8 *lladdr;

	/* length of the ND options that follow the fixed redirect header */
	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
	optlen -= sizeof(*msg);

	if (optlen < 0) {
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
		return;
	}

	msg = (struct rd_msg *)icmp6_hdr(skb);

	if (ipv6_addr_is_multicast(&msg->dest)) {
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
		return;
	}

	/* dest == target means the destination itself is on-link;
	 * otherwise the new first hop must be a link-local unicast
	 */
	on_link = 0;
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
		on_link = 1;
	} else if (ipv6_addr_type(&msg->target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
		return;
	}

	/* forwarding interfaces and interfaces configured to ignore
	 * redirects drop the message here
	 */
	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}

	lladdr = NULL;
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

	rt = (struct rt6_info *) dst;
	if (rt->rt6i_flags & RTF_REJECT) {
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
		return;
	}

	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);

	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
	if (!neigh)
		return;

	/*
	 *	We have finally decided to accept it.
	 */

	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER)),
		     NDISC_REDIRECT, &ndopts);

	nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
	if (!nrt)
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	nrt->rt6i_protocol = RTPROT_REDIRECT;
	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

	if (ip6_ins_rt(nrt))
		goto out_release;

	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
	netevent.daddr = &msg->dest;
	netevent.neigh = neigh;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

	/* the cached entry the redirect arrived on is now stale; the
	 * dst_clone() reference is consumed by ip6_del_rt()
	 */
	if (rt->rt6i_flags & RTF_CACHE) {
		rt = (struct rt6_info *) dst_clone(&rt->dst);
		ip6_del_rt(rt);
	}

out_release:
	/* Release the reference taken in
	 * ip6_rt_cache_alloc()
	 */
	dst_release(&nrt->dst);

out:
	neigh_release(neigh);
}
2802
2803 /*
2804  *      Misc support functions
2805  */
2806
/* Tie the clone @rt to its parent @from: hold a reference on the
 * parent's dst and share its metrics (marked read-only via the 'true'
 * argument to dst_init_metrics()).
 */
static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
{
	/* the parent must not itself be derived from another route */
	BUG_ON(from->dst.from);

	rt->rt6i_flags &= ~RTF_EXPIRES;
	dst_hold(&from->dst);
	rt->dst.from = &from->dst;
	dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
}
2816
/* Initialise the copy @rt from the original route @ort: copy the
 * routing fields, take the references the copy needs (inet6_dev,
 * lwtunnel state) and tie the metrics to @ort via rt6_set_from().
 */
static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
{
	rt->dst.input = ort->dst.input;
	rt->dst.output = ort->dst.output;
	rt->rt6i_dst = ort->rt6i_dst;
	rt->dst.error = ort->dst.error;
	rt->rt6i_idev = ort->rt6i_idev;
	if (rt->rt6i_idev)
		in6_dev_hold(rt->rt6i_idev);
	rt->dst.lastuse = jiffies;
	rt->rt6i_gateway = ort->rt6i_gateway;
	rt->rt6i_flags = ort->rt6i_flags;
	rt6_set_from(rt, ort);
	rt->rt6i_metric = ort->rt6i_metric;
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->rt6i_src;
#endif
	rt->rt6i_prefsrc = ort->rt6i_prefsrc;
	rt->rt6i_table = ort->rt6i_table;
	rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
}
2838
2839 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Find a route previously installed from an RA Route Information
 * option: same prefix, same device, same gateway, and both
 * RTF_ROUTEINFO and RTF_GATEWAY set.
 * Returns the entry with a dst reference held, or NULL.
 */
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	int ifindex = dev->ifindex;
	struct fib6_node *fn;
	struct rt6_info *rt = NULL;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	read_lock_bh(&table->tb6_lock);
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
	if (!fn)
		goto out;

	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
		if (rt->dst.dev->ifindex != ifindex)
			continue;
		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
			continue;
		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
			continue;
		dst_hold(&rt->dst);
		break;
	}
out:
	read_unlock_bh(&table->tb6_lock);
	return rt;
}
2874
2875 static struct rt6_info *rt6_add_route_info(struct net *net,
2876                                            const struct in6_addr *prefix, int prefixlen,
2877                                            const struct in6_addr *gwaddr,
2878                                            struct net_device *dev,
2879                                            unsigned int pref)
2880 {
2881         struct fib6_config cfg = {
2882                 .fc_metric      = IP6_RT_PRIO_USER,
2883                 .fc_ifindex     = dev->ifindex,
2884                 .fc_dst_len     = prefixlen,
2885                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
2886                                   RTF_UP | RTF_PREF(pref),
2887                 .fc_protocol = RTPROT_RA,
2888                 .fc_nlinfo.portid = 0,
2889                 .fc_nlinfo.nlh = NULL,
2890                 .fc_nlinfo.nl_net = net,
2891         };
2892
2893         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
2894         cfg.fc_dst = *prefix;
2895         cfg.fc_gateway = *gwaddr;
2896
2897         /* We should treat it as a default route if prefix length is 0. */
2898         if (!prefixlen)
2899                 cfg.fc_flags |= RTF_DEFAULT;
2900
2901         ip6_route_add(&cfg, NULL);
2902
2903         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
2904 }
2905 #endif
2906
/* Find the RA-learned (RTF_ADDRCONF|RTF_DEFAULT) default route via
 * gateway @addr on @dev.  Returns it with a dst reference held, or NULL.
 */
struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
	struct rt6_info *rt;
	struct fib6_table *table;

	table = fib6_get_table(dev_net(dev), tb_id);
	if (!table)
		return NULL;

	read_lock_bh(&table->tb6_lock);
	/* default routes hang off the root node's leaf chain */
	for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
		if (dev == rt->dst.dev &&
		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
			break;
	}
	if (rt)
		dst_hold(&rt->dst);
	read_unlock_bh(&table->tb6_lock);
	return rt;
}
2929
2930 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
2931                                      struct net_device *dev,
2932                                      unsigned int pref)
2933 {
2934         struct fib6_config cfg = {
2935                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
2936                 .fc_metric      = IP6_RT_PRIO_USER,
2937                 .fc_ifindex     = dev->ifindex,
2938                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
2939                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
2940                 .fc_protocol = RTPROT_RA,
2941                 .fc_nlinfo.portid = 0,
2942                 .fc_nlinfo.nlh = NULL,
2943                 .fc_nlinfo.nl_net = dev_net(dev),
2944         };
2945
2946         cfg.fc_gateway = *gwaddr;
2947
2948         if (!ip6_route_add(&cfg, NULL)) {
2949                 struct fib6_table *table;
2950
2951                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
2952                 if (table)
2953                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
2954         }
2955
2956         return rt6_get_dflt_router(gwaddr, dev);
2957 }
2958
/* Delete every RTF_DEFAULT / RTF_ADDRCONF route from @table, except on
 * interfaces with accept_ra == 2 which keep their default routes.
 *
 * ip6_del_rt() needs the write lock, so the read lock is dropped for
 * each deletion (a dst reference pins the route across the gap) and
 * the scan restarts from the head afterwards.
 */
static void __rt6_purge_dflt_routers(struct fib6_table *table)
{
	struct rt6_info *rt;

restart:
	read_lock_bh(&table->tb6_lock);
	for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
			dst_hold(&rt->dst);
			read_unlock_bh(&table->tb6_lock);
			ip6_del_rt(rt);
			goto restart;
		}
	}
	read_unlock_bh(&table->tb6_lock);

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}
2978
2979 void rt6_purge_dflt_routers(struct net *net)
2980 {
2981         struct fib6_table *table;
2982         struct hlist_head *head;
2983         unsigned int h;
2984
2985         rcu_read_lock();
2986
2987         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
2988                 head = &net->ipv6.fib_table_hash[h];
2989                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
2990                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
2991                                 __rt6_purge_dflt_routers(table);
2992                 }
2993         }
2994
2995         rcu_read_unlock();
2996 }
2997
2998 static void rtmsg_to_fib6_config(struct net *net,
2999                                  struct in6_rtmsg *rtmsg,
3000                                  struct fib6_config *cfg)
3001 {
3002         memset(cfg, 0, sizeof(*cfg));
3003
3004         cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3005                          : RT6_TABLE_MAIN;
3006         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3007         cfg->fc_metric = rtmsg->rtmsg_metric;
3008         cfg->fc_expires = rtmsg->rtmsg_info;
3009         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3010         cfg->fc_src_len = rtmsg->rtmsg_src_len;
3011         cfg->fc_flags = rtmsg->rtmsg_flags;
3012
3013         cfg->fc_nlinfo.nl_net = net;
3014
3015         cfg->fc_dst = rtmsg->rtmsg_dst;
3016         cfg->fc_src = rtmsg->rtmsg_src;
3017         cfg->fc_gateway = rtmsg->rtmsg_gateway;
3018 }
3019
/* Handle the legacy SIOCADDRT/SIOCDELRT ioctls: copy the in6_rtmsg
 * from userspace, convert it to a fib6_config and add/delete the route
 * under the RTNL lock.  Requires CAP_NET_ADMIN in @net's user namespace.
 */
int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
{
	struct fib6_config cfg;
	struct in6_rtmsg rtmsg;
	int err;

	switch (cmd) {
	case SIOCADDRT:		/* Add a route */
	case SIOCDELRT:		/* Delete a route */
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			return -EPERM;
		/* copy_from_user() returns the number of bytes left
		 * uncopied, so any non-zero result means -EFAULT
		 */
		err = copy_from_user(&rtmsg, arg,
				     sizeof(struct in6_rtmsg));
		if (err)
			return -EFAULT;

		rtmsg_to_fib6_config(net, &rtmsg, &cfg);

		rtnl_lock();
		switch (cmd) {
		case SIOCADDRT:
			err = ip6_route_add(&cfg, NULL);
			break;
		case SIOCDELRT:
			err = ip6_route_del(&cfg, NULL);
			break;
		default:
			err = -EINVAL;
		}
		rtnl_unlock();

		return err;
	}

	return -EINVAL;
}
3056
3057 /*
3058  *      Drop the packet on the floor
3059  */
3060
/* Common drop path for reject-type routes: bump the matching SNMP
 * counter, send an ICMPv6 destination-unreachable with @code, and free
 * the packet.  Always returns 0.
 */
static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
	int type;
	struct dst_entry *dst = skb_dst(skb);
	switch (ipstats_mib_noroutes) {
	case IPSTATS_MIB_INNOROUTES:
		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
		if (type == IPV6_ADDR_ANY) {
			/* unspecified destination: count as an address
			 * error, not as a missing route
			 */
			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
				      IPSTATS_MIB_INADDRERRORS);
			break;
		}
		/* FALLTHROUGH */
	case IPSTATS_MIB_OUTNOROUTES:
		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
			      ipstats_mib_noroutes);
		break;
	}
	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
	kfree_skb(skb);
	return 0;
}
3083
3084 static int ip6_pkt_discard(struct sk_buff *skb)
3085 {
3086         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3087 }
3088
3089 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3090 {
3091         skb->dev = skb_dst(skb)->dev;
3092         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3093 }
3094
3095 static int ip6_pkt_prohibit(struct sk_buff *skb)
3096 {
3097         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3098 }
3099
3100 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3101 {
3102         skb->dev = skb_dst(skb)->dev;
3103         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3104 }
3105
3106 /*
3107  *      Allocate a dst for local (unicast / anycast) address.
3108  */
3109
struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
				    const struct in6_addr *addr,
				    bool anycast)
{
	u32 tb_id;
	struct net *net = dev_net(idev->dev);
	struct net_device *dev = idev->dev;
	struct rt6_info *rt;

	/* DST_NOCOUNT: not accounted to the dst garbage collector */
	rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
	if (!rt)
		return ERR_PTR(-ENOMEM);

	/* the route keeps its own reference on the inet6_dev */
	in6_dev_hold(idev);

	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;	/* local delivery */
	rt->dst.output = ip6_output;
	rt->rt6i_idev = idev;

	rt->rt6i_protocol = RTPROT_KERNEL;
	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
	if (anycast)
		rt->rt6i_flags |= RTF_ANYCAST;
	else
		rt->rt6i_flags |= RTF_LOCAL;

	/* a /128 host route for the address itself */
	rt->rt6i_gateway  = *addr;
	rt->rt6i_dst.addr = *addr;
	rt->rt6i_dst.plen = 128;
	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
	rt->rt6i_table = fib6_get_table(net, tb_id);

	return rt;
}
3145
/* remove deleted ip from prefsrc entries */
struct arg_dev_net_ip {
	struct net_device *dev;	/* restrict to this device; NULL matches all */
	struct net *net;	/* namespace being walked */
	struct in6_addr *addr;	/* preferred source address being removed */
};
3152
3153 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
3154 {
3155         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3156         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3157         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3158
3159         if (((void *)rt->dst.dev == dev || !dev) &&
3160             rt != net->ipv6.ip6_null_entry &&
3161             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
3162                 /* remove prefsrc entry */
3163                 rt->rt6i_prefsrc.plen = 0;
3164         }
3165         return 0;
3166 }
3167
3168 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3169 {
3170         struct net *net = dev_net(ifp->idev->dev);
3171         struct arg_dev_net_ip adni = {
3172                 .dev = ifp->idev->dev,
3173                 .net = net,
3174                 .addr = &ifp->addr,
3175         };
3176         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3177 }
3178
#define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

/* Remove routers and update dst entries when gateway turn into host. */
static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
{
	struct in6_addr *gateway = (struct in6_addr *)arg;

	/* match RA-learned router routes and cached gateway routes whose
	 * gateway is @arg; NOTE(review): returning -1 appears to tell
	 * fib6_clean_all() to delete the route - confirm in ip6_fib.c
	 */
	if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
	     ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
	     ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
		return -1;
	}
	return 0;
}
3194
3195 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3196 {
3197         fib6_clean_all(net, fib6_clean_tohost, gateway);
3198 }
3199
/* argument block for fib6_ifdown() */
struct arg_dev_net {
	struct net_device *dev;	/* device going down; NULL matches all */
	struct net *net;	/* namespace being cleaned */
};
3204
/* called with write lock held for table with rt */
static int fib6_ifdown(struct rt6_info *rt, void *arg)
{
	const struct arg_dev_net *adn = arg;
	const struct net_device *dev = adn->dev;

	/* Match routes on @dev (or every route when @dev is NULL),
	 * never the null entry.  Multipath members (rt6i_nsiblings != 0)
	 * survive a plain link-down when ignore_routes_with_linkdown is
	 * set, but are always removed when the device is unregistering.
	 * NOTE(review): the -1 return appears to select the route for
	 * deletion by fib6_clean_all() - confirm in ip6_fib.c.
	 */
	if ((rt->dst.dev == dev || !dev) &&
	    rt != adn->net->ipv6.ip6_null_entry &&
	    (rt->rt6i_nsiblings == 0 ||
	     (dev && netdev_unregistering(dev)) ||
	     !rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
		return -1;

	return 0;
}
3220
3221 void rt6_ifdown(struct net *net, struct net_device *dev)
3222 {
3223         struct arg_dev_net adn = {
3224                 .dev = dev,
3225                 .net = net,
3226         };
3227
3228         fib6_clean_all(net, fib6_ifdown, &adn);
3229         if (dev)
3230                 rt6_uncached_list_flush_dev(net, dev);
3231 }
3232
/* argument block for rt6_mtu_change_route() */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new MTU */
};
3237
/* fib6_clean_all() callback for rt6_mtu_change(): propagate a device
 * MTU change into the MTU metric / rt6i_pmtu of routes on that device.
 * Always returns 0 (never selects a route for deletion).
 */
static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e. jumbo frame)
	 */
	/*
	   If new MTU is less than route PMTU, this new MTU will be the
	   lowest MTU in the path, update the route PMTU to reflect PMTU
	   decreases; if new MTU is greater than route PMTU, and the
	   old MTU is the lowest MTU in the path, update the route PMTU
	   to reflect the increase. In this case if the other nodes' MTU
	   also have the lowest MTU, TOO BIG MESSAGE will be lead to
	   PMTU discovery.
	 */
	if (rt->dst.dev == arg->dev &&
	    dst_metric_raw(&rt->dst, RTAX_MTU) &&
	    !dst_metric_locked(&rt->dst, RTAX_MTU)) {
		if (rt->rt6i_flags & RTF_CACHE) {
			/* For RTF_CACHE with rt6i_pmtu == 0
			 * (i.e. a redirected route),
			 * the metrics of its rt->dst.from has already
			 * been updated.
			 */
			if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
				rt->rt6i_pmtu = arg->mtu;
		} else if (dst_mtu(&rt->dst) >= arg->mtu ||
			   (dst_mtu(&rt->dst) < arg->mtu &&
			    dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
			dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
		}
	}
	return 0;
}
3286
3287 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
3288 {
3289         struct rt6_mtu_change_arg arg = {
3290                 .dev = dev,
3291                 .mtu = mtu,
3292         };
3293
3294         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
3295 }
3296
/* netlink attribute validation policy for IPv6 RTM_*ROUTE requests */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]		= { .len = sizeof(struct in6_addr) },
	[RTA_OIF]		= { .type = NLA_U32 },
	[RTA_IIF]		= { .type = NLA_U32 },
	[RTA_PRIORITY]		= { .type = NLA_U32 },
	[RTA_METRICS]		= { .type = NLA_NESTED },
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]		= { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
	[RTA_ENCAP]		= { .type = NLA_NESTED },
	[RTA_EXPIRES]		= { .type = NLA_U32 },
	[RTA_UID]		= { .type = NLA_U32 },
	[RTA_MARK]		= { .type = NLA_U32 },
};
3311
/* Translate an RTM_NEWROUTE/RTM_DELROUTE netlink request into a
 * fib6_config.  Returns 0 on success or a negative errno on malformed
 * attributes.  @cfg is zeroed first, so absent attributes stay 0.
 */
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg,
			      struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  NULL);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	memset(cfg, 0, sizeof(*cfg));

	cfg->fc_table = rtm->rtm_table;
	cfg->fc_dst_len = rtm->rtm_dst_len;
	cfg->fc_src_len = rtm->rtm_src_len;
	cfg->fc_flags = RTF_UP;
	cfg->fc_protocol = rtm->rtm_protocol;
	cfg->fc_type = rtm->rtm_type;

	/* unreachable/blackhole/prohibit/throw all become reject routes */
	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
	cfg->fc_nlinfo.nlh = nlh;
	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}

	if (tb[RTA_DST]) {
		/* the attribute must carry at least the prefix bytes */
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
						     cfg->fc_mp_len, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_PREF]) {
		/* unknown preference values fall back to medium */
		pref = nla_get_u8(tb[RTA_PREF]);
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

	if (tb[RTA_ENCAP_TYPE]) {
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_EXPIRES]) {
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

		/* only finite timeouts arm RTF_EXPIRES */
		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
		}
	}

	err = 0;
errout:
	return err;
}
3435
/* One parsed nexthop of an RTA_MULTIPATH request, queued on a local
 * list while the whole request is validated before insertion.
 */
struct rt6_nh {
	struct rt6_info *rt6_info;	/* route built for this nexthop */
	struct fib6_config r_cfg;	/* per-nexthop copy of the request config */
	struct mx6_config mxc;		/* metrics converted from r_cfg */
	struct list_head next;		/* link on the caller's rt6_nh_list */
};
3442
3443 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
3444 {
3445         struct rt6_nh *nh;
3446
3447         list_for_each_entry(nh, rt6_nh_list, next) {
3448                 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
3449                         &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
3450                         nh->r_cfg.fc_ifindex);
3451         }
3452 }
3453
3454 static int ip6_route_info_append(struct list_head *rt6_nh_list,
3455                                  struct rt6_info *rt, struct fib6_config *r_cfg)
3456 {
3457         struct rt6_nh *nh;
3458         int err = -EEXIST;
3459
3460         list_for_each_entry(nh, rt6_nh_list, next) {
3461                 /* check if rt6_info already exists */
3462                 if (rt6_duplicate_nexthop(nh->rt6_info, rt))
3463                         return err;
3464         }
3465
3466         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
3467         if (!nh)
3468                 return -ENOMEM;
3469         nh->rt6_info = rt;
3470         err = ip6_convert_metrics(&nh->mxc, r_cfg);
3471         if (err) {
3472                 kfree(nh);
3473                 return err;
3474         }
3475         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
3476         list_add_tail(&nh->next, rt6_nh_list);
3477
3478         return 0;
3479 }
3480
3481 static void ip6_route_mpath_notify(struct rt6_info *rt,
3482                                    struct rt6_info *rt_last,
3483                                    struct nl_info *info,
3484                                    __u16 nlflags)
3485 {
3486         /* if this is an APPEND route, then rt points to the first route
3487          * inserted and rt_last points to last route inserted. Userspace
3488          * wants a consistent dump of the route which starts at the first
3489          * nexthop. Since sibling routes are always added at the end of
3490          * the list, find the first sibling of the last route appended
3491          */
3492         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
3493                 rt = list_first_entry(&rt_last->rt6i_siblings,
3494                                       struct rt6_info,
3495                                       rt6i_siblings);
3496         }
3497
3498         if (rt)
3499                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
3500 }
3501
/* Insert all nexthops of an RTA_MULTIPATH RTM_NEWROUTE request.
 *
 * Phase 1 builds one rt6_info per rtnexthop on a local list so the
 * whole request is validated before anything is installed.  Phase 2
 * inserts them one by one; if an insertion fails, the nexthops that
 * were already installed are deleted again (see add_errout) so the
 * request does not apply partially.
 */
static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct rt6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct rt6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	__u16 nlflags;
	int remaining;
	int attrlen;
	int err = 1;	/* returned as-is when fc_mp has no valid rtnexthop */
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);

	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * rt6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		/* each nexthop starts from the request-wide config,
		 * then applies the rtnexthop's own overrides
		 */
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		rt = ip6_route_info_create(&r_cfg, extack);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}

		err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
		if (err) {
			/* on append failure ownership of rt stays here */
			dst_release_immediate(&rt->dst);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* for add and replace send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		rt_last = nh->rt6_info;
		err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
		/* save reference to first route for notification */
		if (!rt_notif && !err)
			rt_notif = nh->rt6_info;

		/* nh->rt6_info is used or freed at this point, reset to NULL*/
		nh->rt6_info = NULL;
		if (err) {
			if (replace && nhn)
				ip6_print_replace_route_err(&rt6_nh_list);
			err_nh = nh;
			goto add_errout;
		}

		/* Because each route is added like a single route we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		nhn++;
	}

	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	goto cleanup;

add_errout:
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg, extack);
	}

cleanup:
	/* release any routes that were never handed to __ip6_ins_rt() */
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->rt6_info)
			dst_release_immediate(&nh->rt6_info->dst);
		kfree(nh->mxc.mx);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}
3632
3633 static int ip6_route_multipath_del(struct fib6_config *cfg,
3634                                    struct netlink_ext_ack *extack)
3635 {
3636         struct fib6_config r_cfg;
3637         struct rtnexthop *rtnh;
3638         int remaining;
3639         int attrlen;
3640         int err = 1, last_err = 0;
3641
3642         remaining = cfg->fc_mp_len;
3643         rtnh = (struct rtnexthop *)cfg->fc_mp;
3644
3645         /* Parse a Multipath Entry */
3646         while (rtnh_ok(rtnh, remaining)) {
3647                 memcpy(&r_cfg, cfg, sizeof(*cfg));
3648                 if (rtnh->rtnh_ifindex)
3649                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3650
3651                 attrlen = rtnh_attrlen(rtnh);
3652                 if (attrlen > 0) {
3653                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3654
3655                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3656                         if (nla) {
3657                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
3658                                 r_cfg.fc_flags |= RTF_GATEWAY;
3659                         }
3660                 }
3661                 err = ip6_route_del(&r_cfg, extack);
3662                 if (err)
3663                         last_err = err;
3664
3665                 rtnh = rtnh_next(rtnh, &remaining);
3666         }
3667
3668         return last_err;
3669 }
3670
3671 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3672                               struct netlink_ext_ack *extack)
3673 {
3674         struct fib6_config cfg;
3675         int err;
3676
3677         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3678         if (err < 0)
3679                 return err;
3680
3681         if (cfg.fc_mp)
3682                 return ip6_route_multipath_del(&cfg, extack);
3683         else {
3684                 cfg.fc_delete_all_nh = 1;
3685                 return ip6_route_del(&cfg, extack);
3686         }
3687 }
3688
3689 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3690                               struct netlink_ext_ack *extack)
3691 {
3692         struct fib6_config cfg;
3693         int err;
3694
3695         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3696         if (err < 0)
3697                 return err;
3698
3699         if (cfg.fc_mp)
3700                 return ip6_route_multipath_add(&cfg, extack);
3701         else
3702                 return ip6_route_add(&cfg, extack);
3703 }
3704
3705 static size_t rt6_nlmsg_size(struct rt6_info *rt)
3706 {
3707         int nexthop_len = 0;
3708
3709         if (rt->rt6i_nsiblings) {
3710                 nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
3711                             + NLA_ALIGN(sizeof(struct rtnexthop))
3712                             + nla_total_size(16) /* RTA_GATEWAY */
3713                             + lwtunnel_get_encap_size(rt->dst.lwtstate);
3714
3715                 nexthop_len *= rt->rt6i_nsiblings;
3716         }
3717
3718         return NLMSG_ALIGN(sizeof(struct rtmsg))
3719                + nla_total_size(16) /* RTA_SRC */
3720                + nla_total_size(16) /* RTA_DST */
3721                + nla_total_size(16) /* RTA_GATEWAY */
3722                + nla_total_size(16) /* RTA_PREFSRC */
3723                + nla_total_size(4) /* RTA_TABLE */
3724                + nla_total_size(4) /* RTA_IIF */
3725                + nla_total_size(4) /* RTA_OIF */
3726                + nla_total_size(4) /* RTA_PRIORITY */
3727                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
3728                + nla_total_size(sizeof(struct rta_cacheinfo))
3729                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
3730                + nla_total_size(1) /* RTA_PREF */
3731                + lwtunnel_get_encap_size(rt->dst.lwtstate)
3732                + nexthop_len;
3733 }
3734
3735 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
3736                             unsigned int *flags, bool skip_oif)
3737 {
3738         if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) {
3739                 *flags |= RTNH_F_LINKDOWN;
3740                 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
3741                         *flags |= RTNH_F_DEAD;
3742         }
3743
3744         if (rt->rt6i_flags & RTF_GATEWAY) {
3745                 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
3746                         goto nla_put_failure;
3747         }
3748
3749         if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
3750                 *flags |= RTNH_F_OFFLOAD;
3751
3752         /* not needed for multipath encoding b/c it has a rtnexthop struct */
3753         if (!skip_oif && rt->dst.dev &&
3754             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
3755                 goto nla_put_failure;
3756
3757         if (rt->dst.lwtstate &&
3758             lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
3759                 goto nla_put_failure;
3760
3761         return 0;
3762
3763 nla_put_failure:
3764         return -EMSGSIZE;
3765 }
3766
3767 /* add multipath next hop */
3768 static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
3769 {
3770         struct rtnexthop *rtnh;
3771         unsigned int flags = 0;
3772
3773         rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
3774         if (!rtnh)
3775                 goto nla_put_failure;
3776
3777         rtnh->rtnh_hops = 0;
3778         rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
3779
3780         if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
3781                 goto nla_put_failure;
3782
3783         rtnh->rtnh_flags = flags;
3784
3785         /* length of rtnetlink header + attributes */
3786         rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
3787
3788         return 0;
3789
3790 nla_put_failure:
3791         return -EMSGSIZE;
3792 }
3793
/* Build one route message (header + attributes) for @rt into @skb.
 *
 * @dst, @src: when non-NULL (RTM_GETROUTE replies), the queried
 *	addresses are reported with /128 prefix lengths instead of the
 *	route's own prefix.
 * @iif: ingress ifindex for route-get replies; 0 when dumping.
 * Returns 0 on success, or -EMSGSIZE after cancelling the partial
 * message when the skb runs out of tailroom.
 */
static int rt6_fill_node(struct net *net,
			 struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	u32 metrics[RTAX_MAX];
	struct rtmsg *rtm;
	struct nlmsghdr *nlh;
	long expires;
	u32 table;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt->rt6i_dst.plen;
	rtm->rtm_src_len = rt->rt6i_src.plen;
	rtm->rtm_tos = 0;
	if (rt->rt6i_table)
		table = rt->rt6i_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	rtm->rtm_table = table;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;
	/* map a reject route's stored error back to the route type */
	if (rt->rt6i_flags & RTF_REJECT) {
		switch (rt->dst.error) {
		case -EINVAL:
			rtm->rtm_type = RTN_BLACKHOLE;
			break;
		case -EACCES:
			rtm->rtm_type = RTN_PROHIBIT;
			break;
		case -EAGAIN:
			rtm->rtm_type = RTN_THROW;
			break;
		default:
			rtm->rtm_type = RTN_UNREACHABLE;
			break;
		}
	}
	else if (rt->rt6i_flags & RTF_LOCAL)
		rtm->rtm_type = RTN_LOCAL;
	else if (rt->rt6i_flags & RTF_ANYCAST)
		rtm->rtm_type = RTN_ANYCAST;
	else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
		rtm->rtm_type = RTN_LOCAL;
	else
		rtm->rtm_type = RTN_UNICAST;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->rt6i_protocol;

	/* cloned/cached entries are flagged so userspace can filter them */
	if (rt->rt6i_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	/* route-get replies report the exact queried destination (/128);
	 * dumps report the route's own prefix
	 */
	if (dst) {
		if (nla_put_in6_addr(skb, RTA_DST, dst))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		/* multicast destinations are resolved via the mroute
		 * table; ip6mr_get_route() fills the message itself
		 */
		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dst) {
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->rt6i_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->rt6i_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	/* report the cached path MTU (if any) in place of the MTU metric */
	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	if (rt->rt6i_pmtu)
		metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt->rt6i_nsiblings) {
		struct rt6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (rt6_add_nexthop(skb, rt) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->rt6i_siblings, rt6i_siblings) {
			if (rt6_add_nexthop(skb, sibling) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
			goto nla_put_failure;
	}

	expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
		goto nla_put_failure;


	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
3947
3948 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
3949 {
3950         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
3951         struct net *net = arg->net;
3952
3953         if (rt == net->ipv6.ip6_null_entry)
3954                 return 0;
3955
3956         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
3957                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
3958
3959                 /* user wants prefix routes only */
3960                 if (rtm->rtm_flags & RTM_F_PREFIX &&
3961                     !(rt->rt6i_flags & RTF_PREFIX_RT)) {
3962                         /* success since this is not a prefix route */
3963                         return 1;
3964                 }
3965         }
3966
3967         return rt6_fill_node(net,
3968                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
3969                      NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
3970                      NLM_F_MULTI);
3971 }
3972
/* RTM_GETROUTE handler: resolve a route for the addresses in the
 * request and unicast the result back to the querier.
 *
 * With RTM_F_FIB_MATCH set, the matching FIB entry is returned via
 * ip6_route_lookup() instead of the full input/output route decision.
 */
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6;
	bool fibmatch;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	memset(&fl6, 0, sizeof(fl6));
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (iif) {
		/* an ingress device was given: emulate packet reception
		 * on it; the device only needs to stay valid for the
		 * duration of the lookup, hence the RCU read section
		 */
		struct net_device *dev;
		int flags = 0;

		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			rcu_read_unlock();
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		if (!fibmatch)
			dst = ip6_route_input_lookup(net, dev, &fl6, flags);
		else
			dst = ip6_route_lookup(net, &fl6, 0);

		rcu_read_unlock();
	} else {
		fl6.flowi6_oif = oif;

		if (!fibmatch)
			dst = ip6_route_output(net, NULL, &fl6);
		else
			dst = ip6_route_lookup(net, &fl6, 0);
	}


	/* the lookup is assumed to always return a dst; failures are
	 * encoded in dst.error rather than a NULL return
	 */
	rt = container_of(dst, struct rt6_info, dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	/* the skb takes over the route reference held by this function */
	skb_dst_set(skb, &rt->dst);
	if (fibmatch)
		err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	else
		err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}
4098
4099 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
4100                      unsigned int nlm_flags)
4101 {
4102         struct sk_buff *skb;
4103         struct net *net = info->nl_net;
4104         u32 seq;
4105         int err;
4106
4107         err = -ENOBUFS;
4108         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4109
4110         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4111         if (!skb)
4112                 goto errout;
4113
4114         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
4115                                 event, info->portid, seq, nlm_flags);
4116         if (err < 0) {
4117                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4118                 WARN_ON(err == -EMSGSIZE);
4119                 kfree_skb(skb);
4120                 goto errout;
4121         }
4122         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4123                     info->nlh, gfp_any());
4124         return;
4125 errout:
4126         if (err < 0)
4127                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4128 }
4129
/* Loopback netdev notifier: attach the per-netns special routes
 * (null and, with multiple tables, prohibit/blackhole entries) to the
 * loopback device on NETDEV_REGISTER, and drop their inet6_dev
 * references again on the first NETDEV_UNREGISTER.
 */
static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	/* only the loopback device hosts the special routes */
	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		/* each in6_dev_get() takes a reference released on
		 * unregister below
		 */
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	 } else if (event == NETDEV_UNREGISTER &&
		    dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER could be fired for multiple times by
		 * netdev_wait_allrefs(). Make sure we only call this once.
		 */
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}
4162
4163 /*
4164  *      /proc
4165  */
4166
4167 #ifdef CONFIG_PROC_FS
4168
/* /proc/net/ipv6_route: seq_file dump of the per-netns routing table */
static const struct file_operations ipv6_route_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= ipv6_route_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};
4176
4177 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
4178 {
4179         struct net *net = (struct net *)seq->private;
4180         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
4181                    net->ipv6.rt6_stats->fib_nodes,
4182                    net->ipv6.rt6_stats->fib_route_nodes,
4183                    net->ipv6.rt6_stats->fib_rt_alloc,
4184                    net->ipv6.rt6_stats->fib_rt_entries,
4185                    net->ipv6.rt6_stats->fib_rt_cache,
4186                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
4187                    net->ipv6.rt6_stats->fib_discarded_routes);
4188
4189         return 0;
4190 }
4191
/* open() for /proc/net/rt6_stats; single_open_net() binds the seq_file
 * to the owning network namespace
 */
static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
	return single_open_net(inode, file, rt6_stats_seq_show);
}
4196
/* /proc/net/rt6_stats: one-shot FIB statistics dump */
static const struct file_operations rt6_stats_seq_fops = {
	.owner   = THIS_MODULE,
	.open    = rt6_stats_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = single_release_net,
};
4204 #endif  /* CONFIG_PROC_FS */
4205
4206 #ifdef CONFIG_SYSCTL
4207
4208 static
4209 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4210                               void __user *buffer, size_t *lenp, loff_t *ppos)
4211 {
4212         struct net *net;
4213         int delay;
4214         if (!write)
4215                 return -EINVAL;
4216
4217         net = (struct net *)ctl->extra1;
4218         delay = net->ipv6.sysctl.flush_delay;
4219         proc_dointvec(ctl, write, buffer, lenp, ppos);
4220         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
4221         return 0;
4222 }
4223
/* Template for the per-namespace net.ipv6.route.* sysctl table.
 * The .data pointers reference init_net here and are rewritten to the
 * per-namespace fields in ipv6_route_sysctl_init(); keep the entry
 * order in sync with the table[N].data assignments there.
 */
struct ctl_table ipv6_route_table_template[] = {
	{
		/* Write-only trigger (mode 0200): see ipv6_sysctl_rtcache_flush. */
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		/* Millisecond view of the same field as gc_min_interval. */
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{ }
};
4297
4298 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
4299 {
4300         struct ctl_table *table;
4301
4302         table = kmemdup(ipv6_route_table_template,
4303                         sizeof(ipv6_route_table_template),
4304                         GFP_KERNEL);
4305
4306         if (table) {
4307                 table[0].data = &net->ipv6.sysctl.flush_delay;
4308                 table[0].extra1 = net;
4309                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
4310                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
4311                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4312                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
4313                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
4314                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
4315                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
4316                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
4317                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4318
4319                 /* Don't export sysctls to unprivileged users */
4320                 if (net->user_ns != &init_user_ns)
4321                         table[0].procname = NULL;
4322         }
4323
4324         return table;
4325 }
4326 #endif
4327
/* Per-network-namespace setup: dst ops, the permanent special route
 * entries (null, and with CONFIG_IPV6_MULTIPLE_TABLES also prohibit
 * and blackhole), and default values for the route sysctls.
 * Returns 0 or -ENOMEM; partial allocations are unwound via gotos.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	/* Start from the shared template; per-netns fields diverge below. */
	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_ip6_dst_entries;
	/* Each special entry is its own dst.path and uses this netns's ops. */
	net->ipv6.ip6_null_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	/* Defaults for net.ipv6.route.* (see ipv6_route_table_template). */
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

	/* Error unwinding, in reverse order of the allocations above. */
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
4400
/* Per-network-namespace teardown: free the special route entries
 * allocated by ip6_route_net_init() and release the dst counters.
 */
static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
4410
4411 static int __net_init ip6_route_net_init_late(struct net *net)
4412 {
4413 #ifdef CONFIG_PROC_FS
4414         proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
4415         proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
4416 #endif
4417         return 0;
4418 }
4419
/* Late per-namespace teardown: remove the /proc/net entries created
 * in ip6_route_net_init_late().
 */
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
4427
/* Core per-namespace route state (special entries, dst ops, sysctls). */
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
4432
4433 static int __net_init ipv6_inetpeer_init(struct net *net)
4434 {
4435         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
4436
4437         if (!bp)
4438                 return -ENOMEM;
4439         inet_peer_base_init(bp);
4440         net->ipv6.peers = bp;
4441         return 0;
4442 }
4443
/* Per-namespace inet_peer teardown.  The pointer is cleared before the
 * tree is invalidated so no further lookups find it, then the base is
 * freed.
 */
static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	net->ipv6.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}
4452
/* Per-namespace inet_peer base lifecycle. */
static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};
4457
/* Registered last in ip6_route_init(): proc entries only go live once
 * the rest of the per-namespace route state exists.
 */
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};
4462
/* Netdevice event notifier (ip6_route_dev_notify defined earlier in
 * this file).  Priority is deliberately below ADDRCONF_NOTIFY_PRIORITY
 * — presumably so addrconf processes device events first; confirm
 * against addrconf.c before changing.
 */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
4467
4468 void __init ip6_route_init_special_entries(void)
4469 {
4470         /* Registering of the loopback is done before this portion of code,
4471          * the loopback reference in rt6_info will not be taken, do it
4472          * manually for init_net */
4473         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
4474         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4475   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4476         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
4477         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4478         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
4479         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4480   #endif
4481 }
4482
/* Module initialisation for the IPv6 routing subsystem.
 *
 * Registration order matters: dst cache, blackhole dst counters,
 * inetpeer and core pernet ops, fib6, xfrm6, fib6 rules, then the
 * late pernet ops (proc), rtnetlink handlers and the netdev notifier.
 * On failure everything registered so far is unwound via the goto
 * chain at the bottom, in reverse order.
 */
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	/* Blackhole dsts come from the same slab as regular rt6_info. */
	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	ret = -ENOBUFS;
	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, 0) ||
	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, 0) ||
	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL,
			    RTNL_FLAG_DOIT_UNLOCKED))
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	/* Set up the per-cpu uncached dst lists (head + lock each). */
	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

	/* Unwind path: reverse order of the registrations above. */
out_register_late_subsys:
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
4564
/* Module teardown: mirrors ip6_route_init(), releasing everything in
 * reverse order of registration.
 */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}