/*
 * net/ipv6/route.c — from commit "ipv6: add key length check into rt6_select()"
 */
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <trace/events/fib6.h>
67
68 #include <linux/uaccess.h>
69
70 #ifdef CONFIG_SYSCTL
71 #include <linux/sysctl.h>
72 #endif
73
/* Result of a next-hop reachability (NUD) check, as produced by
 * rt6_check_neigh() and propagated through rt6_score_route().
 * Negative values are failures; only RT6_NUD_SUCCEED is a usable state.
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,		/* route must be skipped */
	RT6_NUD_FAIL_PROBE = -2,	/* neighbour failed; worth probing */
	RT6_NUD_FAIL_DO_RR = -1,	/* no neighbour entry; round-robin */
	RT6_NUD_SUCCEED = 1		/* next hop (probably) reachable */
};
80
81 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
82 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
83 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
84 static unsigned int      ip6_mtu(const struct dst_entry *dst);
85 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
86 static void             ip6_dst_destroy(struct dst_entry *);
87 static void             ip6_dst_ifdown(struct dst_entry *,
88                                        struct net_device *dev, int how);
89 static int               ip6_dst_gc(struct dst_ops *ops);
90
91 static int              ip6_pkt_discard(struct sk_buff *skb);
92 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
93 static int              ip6_pkt_prohibit(struct sk_buff *skb);
94 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
95 static void             ip6_link_failure(struct sk_buff *skb);
96 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
97                                            struct sk_buff *skb, u32 mtu);
98 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
99                                         struct sk_buff *skb);
100 static void             rt6_dst_from_metrics_check(struct rt6_info *rt);
101 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
102 static size_t rt6_nlmsg_size(struct rt6_info *rt);
103 static int rt6_fill_node(struct net *net,
104                          struct sk_buff *skb, struct rt6_info *rt,
105                          struct in6_addr *dst, struct in6_addr *src,
106                          int iif, int type, u32 portid, u32 seq,
107                          unsigned int flags);
108 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
109                                            struct in6_addr *daddr,
110                                            struct in6_addr *saddr);
111
112 #ifdef CONFIG_IPV6_ROUTE_INFO
113 static struct rt6_info *rt6_add_route_info(struct net *net,
114                                            const struct in6_addr *prefix, int prefixlen,
115                                            const struct in6_addr *gwaddr,
116                                            struct net_device *dev,
117                                            unsigned int pref);
118 static struct rt6_info *rt6_get_route_info(struct net *net,
119                                            const struct in6_addr *prefix, int prefixlen,
120                                            const struct in6_addr *gwaddr,
121                                            struct net_device *dev);
122 #endif
123
/* Per-CPU list of uncached rt6_info entries, kept so that routes still
 * referencing a device can be re-pointed at loopback when that device
 * goes away (see rt6_uncached_list_flush_dev()). */
struct uncached_list {
	spinlock_t		lock;	/* protects @head */
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
130
131 static void rt6_uncached_list_add(struct rt6_info *rt)
132 {
133         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
134
135         rt->rt6i_uncached_list = ul;
136
137         spin_lock_bh(&ul->lock);
138         list_add_tail(&rt->rt6i_uncached, &ul->head);
139         spin_unlock_bh(&ul->lock);
140 }
141
142 static void rt6_uncached_list_del(struct rt6_info *rt)
143 {
144         if (!list_empty(&rt->rt6i_uncached)) {
145                 struct uncached_list *ul = rt->rt6i_uncached_list;
146
147                 spin_lock_bh(&ul->lock);
148                 list_del(&rt->rt6i_uncached);
149                 spin_unlock_bh(&ul->lock);
150         }
151 }
152
/* @dev is going away: walk every CPU's uncached list and re-point any
 * rt6_info still holding a reference on @dev (either the netdev or its
 * inet6_dev) at the namespace's loopback device, so the entries remain
 * valid until their owners release them. */
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	/* Nothing to migrate if the disappearing device is loopback. */
	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			/* Swap the inet6_dev reference over to loopback. */
			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			/* Swap the netdev reference: take the new hold
			 * before dropping the old one. */
			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
184
/* A per-CPU (RTF_PCPU) clone shares metrics with its parent route via
 * dst.from; write through the parent's metrics block. */
static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
{
	return dst_metrics_write_ptr(rt->dst.from);
}
189
190 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
191 {
192         struct rt6_info *rt = (struct rt6_info *)dst;
193
194         if (rt->rt6i_flags & RTF_PCPU)
195                 return rt6_pcpu_cow_metrics(rt);
196         else if (rt->rt6i_flags & RTF_CACHE)
197                 return NULL;
198         else
199                 return dst_cow_metrics_generic(dst, old);
200 }
201
202 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
203                                              struct sk_buff *skb,
204                                              const void *daddr)
205 {
206         struct in6_addr *p = &rt->rt6i_gateway;
207
208         if (!ipv6_addr_any(p))
209                 return (const void *) p;
210         else if (skb)
211                 return &ipv6_hdr(skb)->daddr;
212         return daddr;
213 }
214
215 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
216                                           struct sk_buff *skb,
217                                           const void *daddr)
218 {
219         struct rt6_info *rt = (struct rt6_info *) dst;
220         struct neighbour *n;
221
222         daddr = choose_neigh_daddr(rt, skb, daddr);
223         n = __ipv6_neigh_lookup(dst->dev, daddr);
224         if (n)
225                 return n;
226         return neigh_create(&nd_tbl, daddr, dst->dev);
227 }
228
/* dst_ops.confirm_neigh: mark the next hop's neighbour entry as
 * recently confirmed, skipping cases where no NUD state can exist. */
static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(rt, NULL, daddr);
	if (!daddr)
		return;
	/* No neighbour discovery on NOARP or loopback interfaces. */
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	/* Multicast destinations have no unicast neighbour entry. */
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}
243
/* dst_ops for ordinary IPv6 routes; used as the template for each
 * network namespace's ip6_dst_ops. */
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	ipv6_cow_metrics,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};
262
263 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
264 {
265         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
266
267         return mtu ? : dst->dev->mtu;
268 }
269
/* Intentional no-ops: dsts using ip6_dst_blackhole_ops (below) ignore
 * PMTU updates and redirects. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}
279
/* dst_ops for blackhole dsts: no gc/ifdown hooks, and PMTU/redirect
 * events are discarded via the no-op handlers above. */
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_neigh_lookup,
};
291
/* Metrics attached to the reject-route templates below; hop limit 0
 * means "unset" (fall back to the per-device default). */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};
295
/* Template for the per-netns "null" route, returned when no route
 * matches: packets are discarded with -ENETUNREACH. */
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,	/* worst possible metric */
	.rt6i_ref	= ATOMIC_INIT(1),
};
310
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

/* Template for the per-netns "prohibit" route: packets are rejected
 * with -EACCES (administratively prohibited). */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,	/* worst possible metric */
	.rt6i_ref	= ATOMIC_INIT(1),
};

/* Template for the per-netns "blackhole" route: packets are silently
 * discarded (error -EINVAL, generic dst_discard handlers). */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,	/* worst possible metric */
	.rt6i_ref	= ATOMIC_INIT(1),
};

#endif
344
/* Initialize the rt6_info-specific part of a freshly allocated route. */
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	/* Zero everything after the embedded dst_entry; dst_alloc()
	 * already set up the dst itself. */
	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_siblings);
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}
353
/* allocate dst with ip6_dst_ops */
static struct rt6_info *__ip6_dst_alloc(struct net *net,
					struct net_device *dev,
					int flags)
{
	/* Initial refcount 1; DST_OBSOLETE_FORCE_CHK makes users go
	 * through ip6_dst_check() before reusing the entry. */
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt)
		rt6_info_init(rt);

	return rt;
}
367
368 struct rt6_info *ip6_dst_alloc(struct net *net,
369                                struct net_device *dev,
370                                int flags)
371 {
372         struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
373
374         if (rt) {
375                 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
376                 if (rt->rt6i_pcpu) {
377                         int cpu;
378
379                         for_each_possible_cpu(cpu) {
380                                 struct rt6_info **p;
381
382                                 p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
383                                 /* no one shares rt */
384                                 *p =  NULL;
385                         }
386                 } else {
387                         dst_release_immediate(&rt->dst);
388                         return NULL;
389                 }
390         }
391
392         return rt;
393 }
394 EXPORT_SYMBOL(ip6_dst_alloc);
395
/* dst_ops.destroy: final teardown of an rt6_info once the last
 * reference is gone — free metrics, per-CPU clones and the exception
 * bucket, and drop the inet6_dev and dst.from references. */
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct rt6_exception_bucket *bucket;
	struct dst_entry *from = dst->from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	free_percpu(rt->rt6i_pcpu);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}
	/* We hold the last reference, so no RCU readers remain (the
	 * "1" asserts the access is safe without rcu_read_lock). */
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
	if (bucket) {
		rt->rt6i_exception_bucket = NULL;
		kfree(bucket);
	}

	dst->from = NULL;
	dst_release(from);
}
421
/* dst_ops.ifdown: @dev is going down — migrate this route's inet6_dev
 * reference over to the loopback device of @dev's namespace. */
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}
438
439 static bool __rt6_check_expired(const struct rt6_info *rt)
440 {
441         if (rt->rt6i_flags & RTF_EXPIRES)
442                 return time_after(jiffies, rt->dst.expires);
443         else
444                 return false;
445 }
446
447 static bool rt6_check_expired(const struct rt6_info *rt)
448 {
449         if (rt->rt6i_flags & RTF_EXPIRES) {
450                 if (time_after(jiffies, rt->dst.expires))
451                         return true;
452         } else if (rt->dst.from) {
453                 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
454                        rt6_check_expired((struct rt6_info *)rt->dst.from);
455         }
456         return false;
457 }
458
459 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
460                                              struct flowi6 *fl6, int oif,
461                                              int strict)
462 {
463         struct rt6_info *sibling, *next_sibling;
464         int route_choosen;
465
466         /* We might have already computed the hash for ICMPv6 errors. In such
467          * case it will always be non-zero. Otherwise now is the time to do it.
468          */
469         if (!fl6->mp_hash)
470                 fl6->mp_hash = rt6_multipath_hash(fl6, NULL);
471
472         route_choosen = fl6->mp_hash % (match->rt6i_nsiblings + 1);
473         /* Don't change the route, if route_choosen == 0
474          * (siblings does not include ourself)
475          */
476         if (route_choosen)
477                 list_for_each_entry_safe(sibling, next_sibling,
478                                 &match->rt6i_siblings, rt6i_siblings) {
479                         route_choosen--;
480                         if (route_choosen == 0) {
481                                 if (rt6_score_route(sibling, oif, strict) < 0)
482                                         break;
483                                 match = sibling;
484                                 break;
485                         }
486                 }
487         return match;
488 }
489
490 /*
491  *      Route lookup. Any table->tb6_lock is implied.
492  */
493
/* Walk the route list starting at @rt and pick the entry matching the
 * requested output interface (@oif) and/or source address (@saddr).
 * Returns @rt unchanged when there is nothing to filter on, and the
 * netns null entry when a strict interface match (RT6_LOOKUP_F_IFACE)
 * finds nothing. */
static inline struct rt6_info *rt6_device_match(struct net *net,
						    struct rt6_info *rt,
						    const struct in6_addr *saddr,
						    int oif,
						    int flags)
{
	struct rt6_info *local = NULL;
	struct rt6_info *sprt;

	/* No constraints: keep the original route. */
	if (!oif && ipv6_addr_any(saddr))
		goto out;

	for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
		struct net_device *dev = sprt->dst.dev;

		if (oif) {
			/* Exact device match wins immediately. */
			if (dev->ifindex == oif)
				return sprt;
			if (dev->flags & IFF_LOOPBACK) {
				if (!sprt->rt6i_idev ||
				    sprt->rt6i_idev->dev->ifindex != oif) {
					if (flags & RT6_LOOKUP_F_IFACE)
						continue;
					/* Keep an earlier loopback route
					 * whose idev matched @oif. */
					if (local &&
					    local->rt6i_idev->dev->ifindex == oif)
						continue;
				}
				local = sprt;
			}
		} else {
			/* No oif: match on source-address ownership. */
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif) {
		if (local)
			return local;

		if (flags & RT6_LOOKUP_F_IFACE)
			return net->ipv6.ip6_null_entry;
	}
out:
	return rt;
}
540
541 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred neighbour-solicitation request, allocated by rt6_probe()
 * and executed/freed by rt6_probe_deferred(); holds a reference on
 * @dev until the work runs. */
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;		/* gateway address to probe */
	struct net_device *dev;
};
547
548 static void rt6_probe_deferred(struct work_struct *w)
549 {
550         struct in6_addr mcaddr;
551         struct __rt6_probe_work *work =
552                 container_of(w, struct __rt6_probe_work, work);
553
554         addrconf_addr_solict_mult(&work->target, &mcaddr);
555         ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
556         dev_put(work->dev);
557         kfree(work);
558 }
559
/* Schedule a deferred reachability probe of @rt's gateway unless its
 * neighbour entry is already valid or was updated too recently. */
static void rt6_probe(struct rt6_info *rt)
{
	struct __rt6_probe_work *work;
	struct neighbour *neigh;
	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
		return;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		/* Rate-limit: only probe if the neighbour has not been
		 * updated within rtr_probe_interval; recheck the state
		 * under the neighbour lock. */
		work = NULL;
		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated +
			       rt->rt6i_idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		/* No neighbour entry yet: always worth probing. */
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = rt->rt6i_gateway;
		/* The work item owns this device reference until it runs. */
		dev_hold(rt->dst.dev);
		work->dev = rt->dst.dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
606 #else
/* CONFIG_IPV6_ROUTER_PREF=n: router reachability probing compiled out. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
610 #endif
611
612 /*
613  * Default Router Selection (RFC 2461 6.3.6)
614  */
615 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
616 {
617         struct net_device *dev = rt->dst.dev;
618         if (!oif || dev->ifindex == oif)
619                 return 2;
620         if ((dev->flags & IFF_LOOPBACK) &&
621             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
622                 return 1;
623         return 0;
624 }
625
/* Classify the reachability of @rt's next hop for route scoring.
 * Routes without a gateway (or marked RTF_NONEXTHOP) trivially
 * succeed; otherwise the neighbour entry's NUD state decides. */
static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
{
	struct neighbour *neigh;
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;

	if (rt->rt6i_flags & RTF_NONEXTHOP ||
	    !(rt->rt6i_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		/* With router preferences, anything short of NUD_FAILED
		 * still counts as usable; a failed neighbour is reported
		 * as worth probing instead of a hard failure. */
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		/* No neighbour entry: usable with router preferences,
		 * otherwise trigger round-robin among candidates. */
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
656
/* Compute a comparable goodness score for @rt against @oif.
 * Returns a negative rt6_nud_state value when the route must be
 * skipped; otherwise a non-negative score combining the interface
 * match (low two bits, from rt6_check_dev) and, with
 * CONFIG_IPV6_ROUTER_PREF, the decoded router preference above them. */
static int rt6_score_route(struct rt6_info *rt, int oif,
			   int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}
675
/* Score @rt and return the better of @rt and the current @match,
 * updating *mpri (best score so far) and *do_rr (set when the chosen
 * route asked for round-robin rotation via RT6_NUD_FAIL_DO_RR). */
static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
				   int *mpri, struct rt6_info *match,
				   bool *do_rr)
{
	int m;
	bool match_do_rr = false;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *dev = rt->dst.dev;

	/* Optionally skip routes whose device has lost carrier. */
	if (dev && !netif_carrier_ok(dev) &&
	    idev->cnf.ignore_routes_with_linkdown &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (rt6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}
713
/* Find the best-scoring route among the entries of @fn that share
 * @metric, scanning round-robin style: from @rr_head to the end of the
 * metric group, then wrapping around from @leaf back up to @rr_head.
 * If none of those matched, fall back to the remaining (worse-metric)
 * routes starting at @cont. */
static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
				     struct rt6_info *leaf,
				     struct rt6_info *rr_head,
				     u32 metric, int oif, int strict,
				     bool *do_rr)
{
	struct rt6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	/* First half: rr_head to the end of the metric group. */
	for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	/* Second half: wrap around from the leaf up to rr_head. */
	for (rt = leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	/* Nothing usable at the preferred metric: consider the rest. */
	for (rt = cont; rt; rt = rt->dst.rt6_next)
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
751
/* Select the route to use from fib6 node @fn, maintaining round-robin
 * state in fn->rr_ptr among equal-metric entries. Returns the netns
 * null entry when the node has no usable route. */
static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
				   int oif, int strict)
{
	struct rt6_info *leaf = fn->leaf;
	struct rt6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf)
		return net->ipv6.ip6_null_entry;

	rt0 = fn->rr_ptr;
	if (!rt0)
		fn->rr_ptr = rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not points to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->rt6i_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	/* With subtrees, a source-routed entry is keyed on rt6i_src. */
	if (rt0->rt6i_src.plen)
		key_plen = rt0->rt6i_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.ip6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct rt6_info *next = rt0->dst.rt6_next;

		/* no entries matched; do round-robin */
		if (!next || next->rt6i_metric != rt0->rt6i_metric)
			next = leaf;

		if (next != rt0)
			fn->rr_ptr = next;
	}

	return match ? match : net->ipv6.ip6_null_entry;
}
796
797 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
798 {
799         return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
800 }
801
802 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Process a Route Information Option received in a Router
 * Advertisement (RFC 4191): validate it, then add, refresh, or delete
 * the corresponding RTF_ROUTEINFO route for gateway @gwaddr.
 * Returns 0 on success, -EINVAL on a malformed option. */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		/* Prefixes longer than 64 bits require the option to
		 * carry the full 16-byte prefix field (length >= 2). */
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	/* Prefix length 0 names the default route for this gateway. */
	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	/* A zero lifetime withdraws the route. */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			rt6_clean_expires(rt);
		else
			rt6_set_expires(rt, jiffies + HZ * lifetime);

		ip6_rt_put(rt);
	}
	return 0;
}
876 #endif
877
878 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
879                                         struct in6_addr *saddr)
880 {
881         struct fib6_node *pn;
882         while (1) {
883                 if (fn->fn_flags & RTN_TL_ROOT)
884                         return NULL;
885                 pn = fn->parent;
886                 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
887                         fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
888                 else
889                         fn = pn;
890                 if (fn->fn_flags & RTN_RTINFO)
891                         return fn;
892         }
893 }
894
895 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
896                           bool null_fallback)
897 {
898         struct rt6_info *rt = *prt;
899
900         if (dst_hold_safe(&rt->dst))
901                 return true;
902         if (null_fallback) {
903                 rt = net->ipv6.ip6_null_entry;
904                 dst_hold(&rt->dst);
905         } else {
906                 rt = NULL;
907         }
908         *prt = rt;
909         return false;
910 }
911
/* Flow-based route lookup in a single table: find the best match for
 * @fl6 in @table under the table read lock and return it with a
 * reference held (the null entry is returned on failure).
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6, int flags)
{
	struct rt6_info *rt, *rt_cache;
	struct fib6_node *fn;

	read_lock_bh(&table->tb6_lock);
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	rt = fn->leaf;
	rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
	/* Spread over ECMP siblings only when no output device was forced */
	if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
		rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
	if (rt == net->ipv6.ip6_null_entry) {
		/* nothing usable at this node: backtrack toward the root */
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}
	/* Search through exception table */
	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
	if (rt_cache)
		rt = rt_cache;

	/* on refcount failure ip6_hold_safe() substitutes the null entry */
	if (ip6_hold_safe(net, &rt, true))
		dst_use_noref(&rt->dst, jiffies);

	read_unlock_bh(&table->tb6_lock);

	trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);

	return rt;

}
946
947 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
948                                     int flags)
949 {
950         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
951 }
952 EXPORT_SYMBOL_GPL(ip6_route_lookup);
953
954 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
955                             const struct in6_addr *saddr, int oif, int strict)
956 {
957         struct flowi6 fl6 = {
958                 .flowi6_oif = oif,
959                 .daddr = *daddr,
960         };
961         struct dst_entry *dst;
962         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
963
964         if (saddr) {
965                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
966                 flags |= RT6_LOOKUP_F_HAS_SADDR;
967         }
968
969         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
970         if (dst->error == 0)
971                 return (struct rt6_info *) dst;
972
973         dst_release(dst);
974
975         return NULL;
976 }
977 EXPORT_SYMBOL(rt6_lookup);
978
/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason,
 * the route is released.
 * Caller must hold dst before calling it.
 */
984
985 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
986                         struct mx6_config *mxc,
987                         struct netlink_ext_ack *extack)
988 {
989         int err;
990         struct fib6_table *table;
991
992         table = rt->rt6i_table;
993         write_lock_bh(&table->tb6_lock);
994         err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
995         write_unlock_bh(&table->tb6_lock);
996
997         return err;
998 }
999
1000 int ip6_ins_rt(struct rt6_info *rt)
1001 {
1002         struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
1003         struct mx6_config mxc = { .mx = NULL, };
1004
1005         /* Hold dst to account for the reference from the fib6 tree */
1006         dst_hold(&rt->dst);
1007         return __ip6_ins_rt(rt, &info, &mxc, NULL);
1008 }
1009
1010 /* called with rcu_lock held */
1011 static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
1012 {
1013         struct net_device *dev = rt->dst.dev;
1014
1015         if (rt->rt6i_flags & RTF_LOCAL) {
1016                 /* for copies of local routes, dst->dev needs to be the
1017                  * device if it is a master device, the master device if
1018                  * device is enslaved, and the loopback as the default
1019                  */
1020                 if (netif_is_l3_slave(dev) &&
1021                     !rt6_need_strict(&rt->rt6i_dst.addr))
1022                         dev = l3mdev_master_dev_rcu(dev);
1023                 else if (!netif_is_l3_master(dev))
1024                         dev = dev_net(dev)->loopback_dev;
1025                 /* last case is netif_is_l3_master(dev) is true in which
1026                  * case we want dev returned to be dev
1027                  */
1028         }
1029
1030         return dev;
1031 }
1032
/* Allocate an RTF_CACHE clone of @ort keyed to the exact (@daddr, @saddr)
 * pair.  Returns NULL on allocation failure; on success the clone carries
 * the reference taken by __ip6_dst_alloc().
 */
static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	/* if ort is itself a clone, clone from its parent route instead */
	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = (struct rt6_info *)ort->dst.from;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(ort);
	rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
	rcu_read_unlock();
	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->rt6i_metric = 0;
	rt->dst.flags |= DST_HOST;
	/* the clone is a host route for exactly this destination */
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->rt6i_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		/* pin the source side too when ort is a subtree route */
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
1075
1076 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
1077 {
1078         struct net_device *dev;
1079         struct rt6_info *pcpu_rt;
1080
1081         rcu_read_lock();
1082         dev = ip6_rt_get_dev_rcu(rt);
1083         pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
1084         rcu_read_unlock();
1085         if (!pcpu_rt)
1086                 return NULL;
1087         ip6_rt_copy_init(pcpu_rt, rt);
1088         pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1089         pcpu_rt->rt6i_flags |= RTF_PCPU;
1090         return pcpu_rt;
1091 }
1092
1093 /* It should be called with read_lock_bh(&tb6_lock) acquired */
1094 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1095 {
1096         struct rt6_info *pcpu_rt, **p;
1097
1098         p = this_cpu_ptr(rt->rt6i_pcpu);
1099         pcpu_rt = *p;
1100
1101         if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
1102                 rt6_dst_from_metrics_check(pcpu_rt);
1103
1104         return pcpu_rt;
1105 }
1106
/* Create the per-cpu copy of @rt for this CPU and publish it via
 * cmpxchg.  Returns the installed copy with a reference held; on
 * allocation failure the (held) null entry is returned instead.
 */
static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		struct net *net = dev_net(rt->dst.dev);

		/* fall back to the null entry, with a reference taken */
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	if (prev) {
		/* If someone did it before us, return prev instead */
		/* release refcnt taken by ip6_rt_pcpu_alloc() */
		dst_release_immediate(&pcpu_rt->dst);
		/* release refcnt taken by above dst_hold() */
		dst_release_immediate(&pcpu_rt->dst);
		dst_hold(&prev->dst);
		pcpu_rt = prev;
	}

	rt6_dst_from_metrics_check(pcpu_rt);
	return pcpu_rt;
}
1135
/* exception hash table implementation
 */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 * The entry is unlinked with hlist_del_rcu() and freed via kfree_rcu()
 * so concurrent RCU readers walking the chain stay safe.
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	if (!bucket || !rt6_ex)
		return;
	/* detach the cached route from its fib6 node */
	rt6_ex->rt6i->rt6i_node = NULL;
	hlist_del_rcu(&rt6_ex->hlist);
	rt6_release(rt6_ex->rt6i);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
}
1155
1156 /* Remove oldest rt6_ex in bucket and free the memory
1157  * Caller must hold rt6_exception_lock
1158  */
1159 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1160 {
1161         struct rt6_exception *rt6_ex, *oldest = NULL;
1162
1163         if (!bucket)
1164                 return;
1165
1166         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1167                 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1168                         oldest = rt6_ex;
1169         }
1170         rt6_remove_exception(bucket, oldest);
1171 }
1172
1173 static u32 rt6_exception_hash(const struct in6_addr *dst,
1174                               const struct in6_addr *src)
1175 {
1176         static u32 seed __read_mostly;
1177         u32 val;
1178
1179         net_get_random_once(&seed, sizeof(seed));
1180         val = jhash(dst, sizeof(*dst), seed);
1181
1182 #ifdef CONFIG_IPV6_SUBTREES
1183         if (src)
1184                 val = jhash(src, sizeof(*src), val);
1185 #endif
1186         return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1187 }
1188
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 *
 * On entry *bucket points at the base of the bucket array; on return it
 * has been advanced to the bucket that (daddr, saddr) hashes into.
 * Returns the matching exception entry, or NULL if there is none.
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		/* the source address only matters for subtree routes */
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1221
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 *
 * RCU-reader twin of __rt6_find_exception_spinlock(): same contract,
 * but walks the chain with hlist_for_each_entry_rcu().
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	/* catch callers that forgot the RCU read-side critical section */
	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		/* the source address only matters for subtree routes */
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1256
/* Install @nrt as a cached exception route under its parent @ort,
 * allocating the bucket array on first use and replacing any previous
 * entry for the same (daddr, saddr) key.  Returns 0 on success, -EINVAL
 * if the table was flushed or the mtu check fails, -ENOMEM on allocation
 * failure.
 */
static int rt6_insert_exception(struct rt6_info *nrt,
				struct rt6_info *ort)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	/* ort can't be a cache or pcpu route */
	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = (struct rt6_info *)ort->dst.from;
	WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));

	spin_lock_bh(&rt6_exception_lock);

	/* refuse to insert once rt6_flush_exceptions() has run on ort */
	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->rt6i_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
		err = -EINVAL;
		goto out;
	}

	/* replace any existing entry for the same (daddr, saddr) key */
	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	atomic_inc(&nrt->rt6i_ref);
	nrt->rt6i_node = ort->rt6i_node;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;

	/* evict the oldest entry when the chain grows past the limit */
	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err)
		fib6_update_sernum(ort);

	return err;
}
1342
1343 void rt6_flush_exceptions(struct rt6_info *rt)
1344 {
1345         struct rt6_exception_bucket *bucket;
1346         struct rt6_exception *rt6_ex;
1347         struct hlist_node *tmp;
1348         int i;
1349
1350         spin_lock_bh(&rt6_exception_lock);
1351         /* Prevent rt6_insert_exception() to recreate the bucket list */
1352         rt->exception_bucket_flushed = 1;
1353
1354         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1355                                     lockdep_is_held(&rt6_exception_lock));
1356         if (!bucket)
1357                 goto out;
1358
1359         for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1360                 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1361                         rt6_remove_exception(bucket, rt6_ex);
1362                 WARN_ON_ONCE(bucket->depth);
1363                 bucket++;
1364         }
1365
1366 out:
1367         spin_unlock_bh(&rt6_exception_lock);
1368 }
1369
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 * Returns the cached clone for (daddr, saddr), or NULL when there is no
 * entry or the entry has expired.
 */
static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->rt6i_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	/* ignore entries that have already expired */
	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}
1401
1402 /* Remove the passed in cached rt from the hash table that contains it */
1403 int rt6_remove_exception_rt(struct rt6_info *rt)
1404 {
1405         struct rt6_info *from = (struct rt6_info *)rt->dst.from;
1406         struct rt6_exception_bucket *bucket;
1407         struct in6_addr *src_key = NULL;
1408         struct rt6_exception *rt6_ex;
1409         int err;
1410
1411         if (!from ||
1412             !(rt->rt6i_flags | RTF_CACHE))
1413                 return -EINVAL;
1414
1415         if (!rcu_access_pointer(from->rt6i_exception_bucket))
1416                 return -ENOENT;
1417
1418         spin_lock_bh(&rt6_exception_lock);
1419         bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1420                                     lockdep_is_held(&rt6_exception_lock));
1421 #ifdef CONFIG_IPV6_SUBTREES
1422         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1423          * and exception table is indexed by a hash of
1424          * both rt6i_dst and rt6i_src.
1425          * Otherwise, the exception table is indexed by
1426          * a hash of only rt6i_dst.
1427          */
1428         if (from->rt6i_src.plen)
1429                 src_key = &rt->rt6i_src.addr;
1430 #endif
1431         rt6_ex = __rt6_find_exception_spinlock(&bucket,
1432                                                &rt->rt6i_dst.addr,
1433                                                src_key);
1434         if (rt6_ex) {
1435                 rt6_remove_exception(bucket, rt6_ex);
1436                 err = 0;
1437         } else {
1438                 err = -ENOENT;
1439         }
1440
1441         spin_unlock_bh(&rt6_exception_lock);
1442         return err;
1443 }
1444
1445 /* Find rt6_ex which contains the passed in rt cache and
1446  * refresh its stamp
1447  */
1448 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1449 {
1450         struct rt6_info *from = (struct rt6_info *)rt->dst.from;
1451         struct rt6_exception_bucket *bucket;
1452         struct in6_addr *src_key = NULL;
1453         struct rt6_exception *rt6_ex;
1454
1455         if (!from ||
1456             !(rt->rt6i_flags | RTF_CACHE))
1457                 return;
1458
1459         rcu_read_lock();
1460         bucket = rcu_dereference(from->rt6i_exception_bucket);
1461
1462 #ifdef CONFIG_IPV6_SUBTREES
1463         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1464          * and exception table is indexed by a hash of
1465          * both rt6i_dst and rt6i_src.
1466          * Otherwise, the exception table is indexed by
1467          * a hash of only rt6i_dst.
1468          */
1469         if (from->rt6i_src.plen)
1470                 src_key = &rt->rt6i_src.addr;
1471 #endif
1472         rt6_ex = __rt6_find_exception_rcu(&bucket,
1473                                           &rt->rt6i_dst.addr,
1474                                           src_key);
1475         if (rt6_ex)
1476                 rt6_ex->stamp = jiffies;
1477
1478         rcu_read_unlock();
1479 }
1480
1481 static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
1482 {
1483         struct rt6_exception_bucket *bucket;
1484         struct rt6_exception *rt6_ex;
1485         int i;
1486
1487         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1488                                         lockdep_is_held(&rt6_exception_lock));
1489
1490         if (bucket) {
1491                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1492                         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1493                                 rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
1494                         }
1495                         bucket++;
1496                 }
1497         }
1498 }
1499
/* Lower the cached pmtu of every exception route under @rt to at most
 * @mtu.  Caller must hold rt6_exception_lock.
 */
static void rt6_exceptions_update_pmtu(struct rt6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;
				/* For RTF_CACHE with rt6i_pmtu == 0
				 * (i.e. a redirected route),
				 * the metrics of its rt->dst.from has already
				 * been updated.
				 */
				if (entry->rt6i_pmtu && entry->rt6i_pmtu > mtu)
					entry->rt6i_pmtu = mtu;
			}
			bucket++;
		}
	}
}
1525
/* an exception route is purgeable here only when it is both a cache
 * entry and routed via a gateway
 */
#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

/* Remove every cached exception of @rt whose gateway equals @gateway.
 * Takes rt6_exception_lock internally.
 */
static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* cheap lockless check before taking the lock */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}
1562
/* Decide the fate of one cached exception during garbage collection:
 * remove it when it is unreferenced and idle past the GC timeout, or
 * when its gateway's neighbour entry no longer carries NTF_ROUTER;
 * otherwise bump gc_args->more so the GC keeps running.
 */
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* __refcnt == 1 means only the exception table holds the route */
	if (atomic_read(&rt->dst.__refcnt) == 1 &&
	    time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
		RT6_TRACE("aging clone %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	} else if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway);
		if (neigh) {
			neigh_flags = neigh->flags;
			neigh_release(neigh);
		}
		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}
	gc_args->more++;
}
1593
1594 void rt6_age_exceptions(struct rt6_info *rt,
1595                         struct fib6_gc_args *gc_args,
1596                         unsigned long now)
1597 {
1598         struct rt6_exception_bucket *bucket;
1599         struct rt6_exception *rt6_ex;
1600         struct hlist_node *tmp;
1601         int i;
1602
1603         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1604                 return;
1605
1606         spin_lock_bh(&rt6_exception_lock);
1607         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1608                                     lockdep_is_held(&rt6_exception_lock));
1609
1610         if (bucket) {
1611                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1612                         hlist_for_each_entry_safe(rt6_ex, tmp,
1613                                                   &bucket->chain, hlist) {
1614                                 rt6_age_examine_exception(bucket, rt6_ex,
1615                                                           gc_args, now);
1616                         }
1617                         bucket++;
1618                 }
1619         }
1620         spin_unlock_bh(&rt6_exception_lock);
1621 }
1622
1623 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1624                                int oif, struct flowi6 *fl6, int flags)
1625 {
1626         struct fib6_node *fn, *saved_fn;
1627         struct rt6_info *rt, *rt_cache;
1628         int strict = 0;
1629
1630         strict |= flags & RT6_LOOKUP_F_IFACE;
1631         strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1632         if (net->ipv6.devconf_all->forwarding == 0)
1633                 strict |= RT6_LOOKUP_F_REACHABLE;
1634
1635         read_lock_bh(&table->tb6_lock);
1636
1637         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1638         saved_fn = fn;
1639
1640         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1641                 oif = 0;
1642
1643 redo_rt6_select:
1644         rt = rt6_select(net, fn, oif, strict);
1645         if (rt->rt6i_nsiblings)
1646                 rt = rt6_multipath_select(rt, fl6, oif, strict);
1647         if (rt == net->ipv6.ip6_null_entry) {
1648                 fn = fib6_backtrack(fn, &fl6->saddr);
1649                 if (fn)
1650                         goto redo_rt6_select;
1651                 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1652                         /* also consider unreachable route */
1653                         strict &= ~RT6_LOOKUP_F_REACHABLE;
1654                         fn = saved_fn;
1655                         goto redo_rt6_select;
1656                 }
1657         }
1658
1659         /*Search through exception table */
1660         rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
1661         if (rt_cache)
1662                 rt = rt_cache;
1663
1664         if (rt == net->ipv6.ip6_null_entry) {
1665                 read_unlock_bh(&table->tb6_lock);
1666                 dst_hold(&rt->dst);
1667                 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1668                 return rt;
1669         } else if (rt->rt6i_flags & RTF_CACHE) {
1670                 if (ip6_hold_safe(net, &rt, true)) {
1671                         dst_use_noref(&rt->dst, jiffies);
1672                         rt6_dst_from_metrics_check(rt);
1673                 }
1674                 read_unlock_bh(&table->tb6_lock);
1675                 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1676                 return rt;
1677         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1678                             !(rt->rt6i_flags & RTF_GATEWAY))) {
1679                 /* Create a RTF_CACHE clone which will not be
1680                  * owned by the fib6 tree.  It is for the special case where
1681                  * the daddr in the skb during the neighbor look-up is different
1682                  * from the fl6->daddr used to look-up route here.
1683                  */
1684
1685                 struct rt6_info *uncached_rt;
1686
1687                 if (ip6_hold_safe(net, &rt, true)) {
1688                         dst_use_noref(&rt->dst, jiffies);
1689                 } else {
1690                         read_unlock_bh(&table->tb6_lock);
1691                         uncached_rt = rt;
1692                         goto uncached_rt_out;
1693                 }
1694                 read_unlock_bh(&table->tb6_lock);
1695
1696                 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1697                 dst_release(&rt->dst);
1698
1699                 if (uncached_rt) {
1700                         /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1701                          * No need for another dst_hold()
1702                          */
1703                         rt6_uncached_list_add(uncached_rt);
1704                 } else {
1705                         uncached_rt = net->ipv6.ip6_null_entry;
1706                         dst_hold(&uncached_rt->dst);
1707                 }
1708
1709 uncached_rt_out:
1710                 trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
1711                 return uncached_rt;
1712
1713         } else {
1714                 /* Get a percpu copy */
1715
1716                 struct rt6_info *pcpu_rt;
1717
1718                 dst_use_noref(&rt->dst, jiffies);
1719                 pcpu_rt = rt6_get_pcpu_route(rt);
1720
1721                 if (pcpu_rt) {
1722                         read_unlock_bh(&table->tb6_lock);
1723                 } else {
1724                         /* atomic_inc_not_zero() is needed when using rcu */
1725                         if (atomic_inc_not_zero(&rt->rt6i_ref)) {
1726                                 /* We have to do the read_unlock first
1727                                  * because rt6_make_pcpu_route() may trigger
1728                                  * ip6_dst_gc() which will take the write_lock.
1729                                  *
1730                                  * No dst_hold() on rt is needed because grabbing
1731                                  * rt->rt6i_ref makes sure rt can't be released.
1732                                  */
1733                                 read_unlock_bh(&table->tb6_lock);
1734                                 pcpu_rt = rt6_make_pcpu_route(rt);
1735                                 rt6_release(rt);
1736                         } else {
1737                                 /* rt is already removed from tree */
1738                                 read_unlock_bh(&table->tb6_lock);
1739                                 pcpu_rt = net->ipv6.ip6_null_entry;
1740                                 dst_hold(&pcpu_rt->dst);
1741                         }
1742                 }
1743
1744                 trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
1745                 return pcpu_rt;
1746         }
1747 }
1748 EXPORT_SYMBOL_GPL(ip6_pol_route);
1749
/* fib6_rule_lookup() callback for input routing: resolve a route,
 * keying the device-strict lookup on the packet's inbound interface
 * (flowi6_iif).
 */
static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
					    struct flowi6 *fl6, int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
}
1755
/* Look up an input route for @fl6 arriving on @dev.
 *
 * Device-scoped destinations (rt6_need_strict()) force an
 * interface-strict lookup.  NOTE(review): ARPHRD_PIMREG devices are
 * exempted from the strict lookup — presumably so PIM register
 * decapsulation works; confirm against the ip6mr code.
 */
struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6, int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1766
/* Extract the L3 flow keys (saddr, daddr, flow label, nexthdr) used
 * for multipath hashing from @skb into @keys.
 *
 * For ICMPv6 error messages the keys are taken from the embedded
 * (offending) packet's header instead of the outer one, so that the
 * error is hashed onto the same path as the flow it refers to.
 */
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;

	/* Fast path: not ICMPv6, hash on the outer header. */
	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	/* Only error messages carry the offending packet. */
	icmph = icmp6_hdr(skb);
	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	/* The inner header may be in non-linear data; copy it out. */
	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
out:
	memset(keys, 0, sizeof(*keys));
	keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
	keys->addrs.v6addrs.src = key_iph->saddr;
	keys->addrs.v6addrs.dst = key_iph->daddr;
	keys->tags.flow_label = ip6_flowinfo(key_iph);
	keys->basic.ip_proto = key_iph->nexthdr;
}
1801
1802 /* if skb is set it will be used and fl6 can be NULL */
1803 u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
1804 {
1805         struct flow_keys hash_keys;
1806
1807         if (skb) {
1808                 ip6_multipath_l3_keys(skb, &hash_keys);
1809                 return flow_hash_from_keys(&hash_keys);
1810         }
1811
1812         return get_hash_from_flowi6(fl6);
1813 }
1814
/* Route an incoming packet: build a flow descriptor from its IPv6
 * header and attach the resulting dst to @skb.
 */
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};

	/* Carry RX tunnel metadata (collect_md) into the flow key. */
	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
	/* Precompute the multipath hash for ICMPv6 so errors are hashed
	 * on the embedded packet and follow the original flow's path.
	 */
	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
	skb_dst_drop(skb);
	skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
}
1838
/* fib6_rule_lookup() callback for output routing: resolve a route,
 * keying the device-strict lookup on the flow's outbound interface
 * (flowi6_oif).
 */
static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
					     struct flowi6 *fl6, int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
}
1844
/* Resolve an output route for @fl6 on behalf of @sk (may be NULL).
 * The caller owns a reference on the returned dst.
 */
struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	/* Device-scoped destinations on an L3 master device are
	 * resolved by the l3mdev code directly.
	 */
	if (rt6_need_strict(&fl6->daddr)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	/* Locally generated traffic: input interface is loopback. */
	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	/* Bind the lookup to a device when the socket is bound to one,
	 * the destination requires it, or an oif was given without a
	 * source address.
	 */
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		/* No source address yet: honour the socket's
		 * source-address preferences (IPV6_ADDR_PREFERENCES).
		 */
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1873
/* Clone @dst_orig into a "blackhole" route that silently discards all
 * input and output traffic while preserving the original's metrics,
 * gateway and prefix; @dst_orig is released.  Returns the new dst or
 * ERR_PTR(-ENOMEM).
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_NONE, 0);
	if (rt) {
		rt6_info_init(rt);

		new = &rt->dst;
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		/* RTF_PCPU is internal to the original route and must
		 * not leak into the copy.
		 */
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
		rt->rt6i_metric = 0;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}
1906
1907 /*
1908  *      Destination cache support functions
1909  */
1910
/* If @rt inherits its metrics from a parent route (dst.from) and the
 * parent's metrics block has been replaced since, re-point @rt's
 * metrics at the parent's current block.
 */
static void rt6_dst_from_metrics_check(struct rt6_info *rt)
{
	if (rt->dst.from &&
	    dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
		dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
}
1917
1918 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1919 {
1920         u32 rt_cookie = 0;
1921
1922         if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
1923                 return NULL;
1924
1925         if (rt6_check_expired(rt))
1926                 return NULL;
1927
1928         return &rt->dst;
1929 }
1930
1931 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1932 {
1933         if (!__rt6_check_expired(rt) &&
1934             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1935             rt6_check((struct rt6_info *)(rt->dst.from), cookie))
1936                 return &rt->dst;
1937         else
1938                 return NULL;
1939 }
1940
/* dst_ops->check: validate a cached dst before it is reused. */
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rt6_info *rt;

	rt = (struct rt6_info *) dst;

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	rt6_dst_from_metrics_check(rt);

	/* Per-cpu copies and uncached clones that inherit from a
	 * parent route (dst.from set) are validated through that
	 * parent; everything else is validated directly.
	 */
	if (rt->rt6i_flags & RTF_PCPU ||
	    (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from))
		return rt6_dst_from_check(rt, cookie);
	else
		return rt6_check(rt, cookie);
}
1960
/* dst_ops->negative_advice: the caller found this dst unsatisfactory.
 * An expired RTF_CACHE clone is deleted from the tree; a non-cached
 * route just has the caller's reference dropped.  Returns the dst if
 * it is still usable, NULL if the caller should do a fresh lookup.
 */
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			if (rt6_check_expired(rt)) {
				ip6_del_rt(rt);
				dst = NULL;
			}
		} else {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}
1978
/* dst_ops->link_failure: neighbour resolution failed on this dst.
 * Report unreachability to the sender and invalidate the route so
 * subsequent lookups do not keep selecting it.
 */
static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			/* Delete the cached clone, but only if we can
			 * still take a reference on it.
			 */
			if (dst_hold_safe(&rt->dst))
				ip6_del_rt(rt);
		} else {
			struct fib6_node *fn;

			rcu_read_lock();
			fn = rcu_dereference(rt->rt6i_node);
			/* Invalidate the node's serial number so that
			 * cookies of dsts cached for this default route
			 * stop matching and force a relookup.
			 */
			if (fn && (rt->rt6i_flags & RTF_DEFAULT))
				fn->fn_sernum = -1;
			rcu_read_unlock();
		}
	}
}
2001
/* Record a newly learned path MTU on @rt and (re)arm its expiry so
 * the value times out after the ip6_rt_mtu_expires sysctl interval.
 */
static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
	struct net *net = dev_net(rt->dst.dev);

	rt->rt6i_flags |= RTF_MODIFIED;
	rt->rt6i_pmtu = mtu;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}
2010
2011 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2012 {
2013         return !(rt->rt6i_flags & RTF_CACHE) &&
2014                 (rt->rt6i_flags & RTF_PCPU ||
2015                  rcu_access_pointer(rt->rt6i_node));
2016 }
2017
/* Core PMTU update: record @mtu against the route behind @dst.  The
 * address pair is taken from @iph when given, else from @sk, else
 * left NULL (in which case no exception clone can be created).
 */
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	/* Local delivery has no path MTU to track. */
	if (rt6->rt6i_flags & RTF_LOCAL)
		return;

	/* An administratively locked MTU must not be overridden. */
	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}
	dst_confirm_neigh(dst, daddr);
	/* Never go below the IPv6 minimum MTU, and only ever shrink. */
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
	} else if (daddr) {
		/* Record the PMTU in a host-specific RTF_CACHE clone
		 * inserted into the route's exception table.
		 */
		struct rt6_info *nrt6;

		nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);
			if (rt6_insert_exception(nrt6, rt6))
				dst_release_immediate(&nrt6->dst);
		}
	}
}
2061
/* dst_ops->update_pmtu: thin wrapper deriving the IPv6 header from
 * @skb when one is available.
 */
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu)
{
	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
}
2067
/* Update the PMTU for the flow described by the IPv6 header at
 * @skb->data.  @mtu is in network byte order (as carried in an ICMPv6
 * Packet Too Big message) and converted before use.
 */
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
		     int oif, u32 mark, kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_oif = oif;
	/* No explicit mark: fall back to the reply mark for this skb. */
	fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
	fl6.flowlabel = ip6_flowinfo(iph);
	fl6.flowi6_uid = uid;

	dst = ip6_route_output(net, NULL, &fl6);
	if (!dst->error)
		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2089
/* Socket-level PMTU update: apply the new MTU to @sk's flow, then, if
 * the socket's cached dst no longer validates, refresh the datagram
 * socket's route.
 */
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	struct dst_entry *dst;

	ip6_update_pmtu(skb, sock_net(sk), mtu,
			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);

	/* Nothing more to do if there is no cached dst or it still
	 * passes its ->check().
	 */
	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	/* NOTE(review): v4-mapped destinations are skipped here,
	 * presumably because they are handled by the IPv4 path —
	 * confirm against the IPv4 datagram code.
	 */
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2108
/* Handle redirects.
 * Flow descriptor extended with the address of the redirecting router;
 * __ip6_route_redirect() downcasts the flowi6 it receives back to this.
 */
struct ip6rd_flowi {
	struct flowi6 fl6;		/* must stay first: cast target */
	struct in6_addr gateway;	/* router the redirect came from */
};
2114
/* Table-lookup callback used while processing an ICMPv6 redirect:
 * find the route the redirect applies to — a non-expired gateway
 * route out of the receiving interface whose gateway (or whose cached
 * exception's gateway) matches the redirecting router.  Returns a
 * held route, possibly ip6_null_entry.
 */
static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *rt, *rt_cache;
	struct fib6_node *fn;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	read_lock_bh(&table->tb6_lock);
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
		if (rt6_check_expired(rt))
			continue;
		if (rt->dst.error)
			break;
		if (!(rt->rt6i_flags & RTF_GATEWAY))
			continue;
		if (fl6->flowi6_oif != rt->dst.dev->ifindex)
			continue;
		/* rt_cache's gateway might be different from its 'parent'
		 * in the case of an ip redirect.
		 * So we keep searching in the exception table if the gateway
		 * is different.
		 */
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
			rt_cache = rt6_find_cached_rt(rt,
						      &fl6->daddr,
						      &fl6->saddr);
			if (rt_cache &&
			    ipv6_addr_equal(&rdfl->gateway,
					    &rt_cache->rt6i_gateway)) {
				rt = rt_cache;
				break;
			}
			continue;
		}
		break;
	}

	if (!rt)
		rt = net->ipv6.ip6_null_entry;
	else if (rt->dst.error) {
		rt = net->ipv6.ip6_null_entry;
		goto out;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		/* Nothing matched at this node; retry one level up. */
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

out:
	ip6_hold_safe(net, &rt, true);

	read_unlock_bh(&table->tb6_lock);

	trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
	return rt;
};
2187
2188 static struct dst_entry *ip6_route_redirect(struct net *net,
2189                                         const struct flowi6 *fl6,
2190                                         const struct in6_addr *gateway)
2191 {
2192         int flags = RT6_LOOKUP_F_HAS_SADDR;
2193         struct ip6rd_flowi rdfl;
2194
2195         rdfl.fl6 = *fl6;
2196         rdfl.gateway = *gateway;
2197
2198         return fib6_rule_lookup(net, &rdfl.fl6,
2199                                 flags, __ip6_route_redirect);
2200 }
2201
/* Process an ICMPv6 redirect for the packet whose IPv6 header is at
 * @skb->data: look up the route the redirect applies to and apply it
 * via rt6_do_redirect().
 */
void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
		  kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_iif = LOOPBACK_IFINDEX;
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark;
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
	fl6.flowlabel = ip6_flowinfo(iph);
	fl6.flowi6_uid = uid;

	dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_redirect);
2223
/* Variant of ip6_redirect() for redirect messages without the
 * redirected-header option: the flow is reconstructed from the
 * redirect message itself (target taken from msg->dest; the lookup
 * "source" is the address the redirect was sent to, i.e. ours).
 */
void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
			    u32 mark)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_iif = LOOPBACK_IFINDEX;
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark;
	fl6.daddr = msg->dest;
	fl6.saddr = iph->daddr;
	fl6.flowi6_uid = sock_net_uid(net, NULL);

	dst = ip6_route_redirect(net, &fl6, &iph->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
2244
/* Apply a received redirect to the flow of socket @sk. */
void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
		     sk->sk_uid);
}
EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2251
2252 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2253 {
2254         struct net_device *dev = dst->dev;
2255         unsigned int mtu = dst_mtu(dst);
2256         struct net *net = dev_net(dev);
2257
2258         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2259
2260         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2261                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2262
2263         /*
2264          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2265          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2266          * IPV6_MAXPLEN is also valid and means: "any MSS,
2267          * rely only on pmtu discovery"
2268          */
2269         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2270                 mtu = IPV6_MAXPLEN;
2271         return mtu;
2272 }
2273
/* dst_ops->mtu: effective MTU for this route.  Precedence: the PMTU
 * learned on the route (rt6i_pmtu), then the raw RTAX_MTU metric,
 * then the device's IPv6 MTU (floor IPV6_MIN_MTU).  The result is
 * capped at IP6_MAX_MTU and reduced by any lwtunnel encap headroom.
 */
static unsigned int ip6_mtu(const struct dst_entry *dst)
{
	const struct rt6_info *rt = (const struct rt6_info *)dst;
	unsigned int mtu = rt->rt6i_pmtu;
	struct inet6_dev *idev;

	if (mtu)
		goto out;

	mtu = dst_metric_raw(dst, RTAX_MTU);
	if (mtu)
		goto out;

	mtu = IPV6_MIN_MTU;

	rcu_read_lock();
	idev = __in6_dev_get(dst->dev);
	if (idev)
		mtu = idev->cnf.mtu6;
	rcu_read_unlock();

out:
	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}
2300
/* Allocate a standalone dst for sending an ICMPv6/NDISC packet out of
 * @dev towards fl6->daddr, passed through xfrm.  The dst is never
 * inserted into the fib tree; it lives on the uncached list instead.
 * Returns the dst or an ERR_PTR().
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		in6_dev_put(idev);
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	rt->dst.flags |= DST_HOST;
	rt->dst.output  = ip6_output;
	rt->rt6i_gateway  = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev     = idev;
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Add this dst into uncached_list so that rt6_ifdown() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}
2337
/* dst_ops->gc: garbage-collect IPv6 dst entries.  Skips the sweep
 * when the last GC ran within ip6_rt_gc_min_interval and the entry
 * count is within ip6_rt_max_size; otherwise runs fib6_run_gc() with
 * an adaptively grown/decayed expiry interval.  Returns nonzero when
 * the entry count still exceeds rt_max_size after collection.
 */
static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	/* Each pressured GC pass grows the expiry argument ... */
	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
	/* ... and it decays geometrically, governed by rt_elasticity. */
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
	return entries > rt_max_size;
}
2362
2363 static int ip6_convert_metrics(struct mx6_config *mxc,
2364                                const struct fib6_config *cfg)
2365 {
2366         bool ecn_ca = false;
2367         struct nlattr *nla;
2368         int remaining;
2369         u32 *mp;
2370
2371         if (!cfg->fc_mx)
2372                 return 0;
2373
2374         mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
2375         if (unlikely(!mp))
2376                 return -ENOMEM;
2377
2378         nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
2379                 int type = nla_type(nla);
2380                 u32 val;
2381
2382                 if (!type)
2383                         continue;
2384                 if (unlikely(type > RTAX_MAX))
2385                         goto err;
2386
2387                 if (type == RTAX_CC_ALGO) {
2388                         char tmp[TCP_CA_NAME_MAX];
2389
2390                         nla_strlcpy(tmp, nla, sizeof(tmp));
2391                         val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
2392                         if (val == TCP_CA_UNSPEC)
2393                                 goto err;
2394                 } else {
2395                         val = nla_get_u32(nla);
2396                 }
2397                 if (type == RTAX_HOPLIMIT && val > 255)
2398                         val = 255;
2399                 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
2400                         goto err;
2401
2402                 mp[type - 1] = val;
2403                 __set_bit(type - 1, mxc->mx_valid);
2404         }
2405
2406         if (ecn_ca) {
2407                 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
2408                 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
2409         }
2410
2411         mxc->mx = mp;
2412         return 0;
2413  err:
2414         kfree(mp);
2415         return -EINVAL;
2416 }
2417
/* Resolve the nexthop gateway @gw_addr within the fib table named in
 * @cfg.  Returns a held route, or NULL when the table does not exist
 * or the lookup only produced the null entry (so the caller can fall
 * back to a full lookup).
 */
static struct rt6_info *ip6_nh_lookup_table(struct net *net,
					    struct fib6_config *cfg,
					    const struct in6_addr *gw_addr)
{
	struct flowi6 fl6 = {
		.flowi6_oif = cfg->fc_ifindex,
		.daddr = *gw_addr,
		.saddr = cfg->fc_prefsrc,
	};
	struct fib6_table *table;
	struct rt6_info *rt;
	int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE;

	table = fib6_get_table(net, cfg->fc_table);
	if (!table)
		return NULL;

	if (!ipv6_addr_any(&cfg->fc_prefsrc))
		flags |= RT6_LOOKUP_F_HAS_SADDR;

	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);

	/* if table lookup failed, fall back to full lookup */
	if (rt == net->ipv6.ip6_null_entry) {
		ip6_rt_put(rt);
		rt = NULL;
	}

	return rt;
}
2448
2449 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
2450                                               struct netlink_ext_ack *extack)
2451 {
2452         struct net *net = cfg->fc_nlinfo.nl_net;
2453         struct rt6_info *rt = NULL;
2454         struct net_device *dev = NULL;
2455         struct inet6_dev *idev = NULL;
2456         struct fib6_table *table;
2457         int addr_type;
2458         int err = -EINVAL;
2459
2460         /* RTF_PCPU is an internal flag; can not be set by userspace */
2461         if (cfg->fc_flags & RTF_PCPU) {
2462                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2463                 goto out;
2464         }
2465
2466         if (cfg->fc_dst_len > 128) {
2467                 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2468                 goto out;
2469         }
2470         if (cfg->fc_src_len > 128) {
2471                 NL_SET_ERR_MSG(extack, "Invalid source address length");
2472                 goto out;
2473         }
2474 #ifndef CONFIG_IPV6_SUBTREES
2475         if (cfg->fc_src_len) {
2476                 NL_SET_ERR_MSG(extack,
2477                                "Specifying source address requires IPV6_SUBTREES to be enabled");
2478                 goto out;
2479         }
2480 #endif
2481         if (cfg->fc_ifindex) {
2482                 err = -ENODEV;
2483                 dev = dev_get_by_index(net, cfg->fc_ifindex);
2484                 if (!dev)
2485                         goto out;
2486                 idev = in6_dev_get(dev);
2487                 if (!idev)
2488                         goto out;
2489         }
2490
2491         if (cfg->fc_metric == 0)
2492                 cfg->fc_metric = IP6_RT_PRIO_USER;
2493
2494         err = -ENOBUFS;
2495         if (cfg->fc_nlinfo.nlh &&
2496             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2497                 table = fib6_get_table(net, cfg->fc_table);
2498                 if (!table) {
2499                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2500                         table = fib6_new_table(net, cfg->fc_table);
2501                 }
2502         } else {
2503                 table = fib6_new_table(net, cfg->fc_table);
2504         }
2505
2506         if (!table)
2507                 goto out;
2508
2509         rt = ip6_dst_alloc(net, NULL,
2510                            (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
2511
2512         if (!rt) {
2513                 err = -ENOMEM;
2514                 goto out;
2515         }
2516
2517         if (cfg->fc_flags & RTF_EXPIRES)
2518                 rt6_set_expires(rt, jiffies +
2519                                 clock_t_to_jiffies(cfg->fc_expires));
2520         else
2521                 rt6_clean_expires(rt);
2522
2523         if (cfg->fc_protocol == RTPROT_UNSPEC)
2524                 cfg->fc_protocol = RTPROT_BOOT;
2525         rt->rt6i_protocol = cfg->fc_protocol;
2526
2527         addr_type = ipv6_addr_type(&cfg->fc_dst);
2528
2529         if (addr_type & IPV6_ADDR_MULTICAST)
2530                 rt->dst.input = ip6_mc_input;
2531         else if (cfg->fc_flags & RTF_LOCAL)
2532                 rt->dst.input = ip6_input;
2533         else
2534                 rt->dst.input = ip6_forward;
2535
2536         rt->dst.output = ip6_output;
2537
2538         if (cfg->fc_encap) {
2539                 struct lwtunnel_state *lwtstate;
2540
2541                 err = lwtunnel_build_state(cfg->fc_encap_type,
2542                                            cfg->fc_encap, AF_INET6, cfg,
2543                                            &lwtstate, extack);
2544                 if (err)
2545                         goto out;
2546                 rt->dst.lwtstate = lwtstate_get(lwtstate);
2547                 if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
2548                         rt->dst.lwtstate->orig_output = rt->dst.output;
2549                         rt->dst.output = lwtunnel_output;
2550                 }
2551                 if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
2552                         rt->dst.lwtstate->orig_input = rt->dst.input;
2553                         rt->dst.input = lwtunnel_input;
2554                 }
2555         }
2556
2557         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
2558         rt->rt6i_dst.plen = cfg->fc_dst_len;
2559         if (rt->rt6i_dst.plen == 128)
2560                 rt->dst.flags |= DST_HOST;
2561
2562 #ifdef CONFIG_IPV6_SUBTREES
2563         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
2564         rt->rt6i_src.plen = cfg->fc_src_len;
2565 #endif
2566
2567         rt->rt6i_metric = cfg->fc_metric;
2568
2569         /* We cannot add true routes via loopback here,
2570            they would result in kernel looping; promote them to reject routes
2571          */
2572         if ((cfg->fc_flags & RTF_REJECT) ||
2573             (dev && (dev->flags & IFF_LOOPBACK) &&
2574              !(addr_type & IPV6_ADDR_LOOPBACK) &&
2575              !(cfg->fc_flags & RTF_LOCAL))) {
2576                 /* hold loopback dev/idev if we haven't done so. */
2577                 if (dev != net->loopback_dev) {
2578                         if (dev) {
2579                                 dev_put(dev);
2580                                 in6_dev_put(idev);
2581                         }
2582                         dev = net->loopback_dev;
2583                         dev_hold(dev);
2584                         idev = in6_dev_get(dev);
2585                         if (!idev) {
2586                                 err = -ENODEV;
2587                                 goto out;
2588                         }
2589                 }
2590                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
2591                 switch (cfg->fc_type) {
2592                 case RTN_BLACKHOLE:
2593                         rt->dst.error = -EINVAL;
2594                         rt->dst.output = dst_discard_out;
2595                         rt->dst.input = dst_discard;
2596                         break;
2597                 case RTN_PROHIBIT:
2598                         rt->dst.error = -EACCES;
2599                         rt->dst.output = ip6_pkt_prohibit_out;
2600                         rt->dst.input = ip6_pkt_prohibit;
2601                         break;
2602                 case RTN_THROW:
2603                 case RTN_UNREACHABLE:
2604                 default:
2605                         rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
2606                                         : (cfg->fc_type == RTN_UNREACHABLE)
2607                                         ? -EHOSTUNREACH : -ENETUNREACH;
2608                         rt->dst.output = ip6_pkt_discard_out;
2609                         rt->dst.input = ip6_pkt_discard;
2610                         break;
2611                 }
2612                 goto install_route;
2613         }
2614
2615         if (cfg->fc_flags & RTF_GATEWAY) {
2616                 const struct in6_addr *gw_addr;
2617                 int gwa_type;
2618
2619                 gw_addr = &cfg->fc_gateway;
2620                 gwa_type = ipv6_addr_type(gw_addr);
2621
2622                 /* if gw_addr is local we will fail to detect this in case
2623                  * address is still TENTATIVE (DAD in progress). rt6_lookup()
2624                  * will return already-added prefix route via interface that
2625                  * prefix route was assigned to, which might be non-loopback.
2626                  */
2627                 err = -EINVAL;
2628                 if (ipv6_chk_addr_and_flags(net, gw_addr,
2629                                             gwa_type & IPV6_ADDR_LINKLOCAL ?
2630                                             dev : NULL, 0, 0)) {
2631                         NL_SET_ERR_MSG(extack, "Invalid gateway address");
2632                         goto out;
2633                 }
2634                 rt->rt6i_gateway = *gw_addr;
2635
2636                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
2637                         struct rt6_info *grt = NULL;
2638
2639                         /* IPv6 strictly inhibits using not link-local
2640                            addresses as nexthop address.
2641                            Otherwise, router will not able to send redirects.
2642                            It is very good, but in some (rare!) circumstances
2643                            (SIT, PtP, NBMA NOARP links) it is handy to allow
2644                            some exceptions. --ANK
2645                            We allow IPv4-mapped nexthops to support RFC4798-type
2646                            addressing
2647                          */
2648                         if (!(gwa_type & (IPV6_ADDR_UNICAST |
2649                                           IPV6_ADDR_MAPPED))) {
2650                                 NL_SET_ERR_MSG(extack,
2651                                                "Invalid gateway address");
2652                                 goto out;
2653                         }
2654
2655                         if (cfg->fc_table) {
2656                                 grt = ip6_nh_lookup_table(net, cfg, gw_addr);
2657
2658                                 if (grt) {
2659                                         if (grt->rt6i_flags & RTF_GATEWAY ||
2660                                             (dev && dev != grt->dst.dev)) {
2661                                                 ip6_rt_put(grt);
2662                                                 grt = NULL;
2663                                         }
2664                                 }
2665                         }
2666
2667                         if (!grt)
2668                                 grt = rt6_lookup(net, gw_addr, NULL,
2669                                                  cfg->fc_ifindex, 1);
2670
2671                         err = -EHOSTUNREACH;
2672                         if (!grt)
2673                                 goto out;
2674                         if (dev) {
2675                                 if (dev != grt->dst.dev) {
2676                                         ip6_rt_put(grt);
2677                                         goto out;
2678                                 }
2679                         } else {
2680                                 dev = grt->dst.dev;
2681                                 idev = grt->rt6i_idev;
2682                                 dev_hold(dev);
2683                                 in6_dev_hold(grt->rt6i_idev);
2684                         }
2685                         if (!(grt->rt6i_flags & RTF_GATEWAY))
2686                                 err = 0;
2687                         ip6_rt_put(grt);
2688
2689                         if (err)
2690                                 goto out;
2691                 }
2692                 err = -EINVAL;
2693                 if (!dev) {
2694                         NL_SET_ERR_MSG(extack, "Egress device not specified");
2695                         goto out;
2696                 } else if (dev->flags & IFF_LOOPBACK) {
2697                         NL_SET_ERR_MSG(extack,
2698                                        "Egress device can not be loopback device for this route");
2699                         goto out;
2700                 }
2701         }
2702
2703         err = -ENODEV;
2704         if (!dev)
2705                 goto out;
2706
2707         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2708                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2709                         NL_SET_ERR_MSG(extack, "Invalid source address");
2710                         err = -EINVAL;
2711                         goto out;
2712                 }
2713                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2714                 rt->rt6i_prefsrc.plen = 128;
2715         } else
2716                 rt->rt6i_prefsrc.plen = 0;
2717
2718         rt->rt6i_flags = cfg->fc_flags;
2719
2720 install_route:
2721         rt->dst.dev = dev;
2722         rt->rt6i_idev = idev;
2723         rt->rt6i_table = table;
2724
2725         cfg->fc_nlinfo.nl_net = dev_net(dev);
2726
2727         return rt;
2728 out:
2729         if (dev)
2730                 dev_put(dev);
2731         if (idev)
2732                 in6_dev_put(idev);
2733         if (rt)
2734                 dst_release_immediate(&rt->dst);
2735
2736         return ERR_PTR(err);
2737 }
2738
2739 int ip6_route_add(struct fib6_config *cfg,
2740                   struct netlink_ext_ack *extack)
2741 {
2742         struct mx6_config mxc = { .mx = NULL, };
2743         struct rt6_info *rt;
2744         int err;
2745
2746         rt = ip6_route_info_create(cfg, extack);
2747         if (IS_ERR(rt)) {
2748                 err = PTR_ERR(rt);
2749                 rt = NULL;
2750                 goto out;
2751         }
2752
2753         err = ip6_convert_metrics(&mxc, cfg);
2754         if (err)
2755                 goto out;
2756
2757         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
2758
2759         kfree(mxc.mx);
2760
2761         return err;
2762 out:
2763         if (rt)
2764                 dst_release_immediate(&rt->dst);
2765
2766         return err;
2767 }
2768
2769 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2770 {
2771         int err;
2772         struct fib6_table *table;
2773         struct net *net = dev_net(rt->dst.dev);
2774
2775         if (rt == net->ipv6.ip6_null_entry) {
2776                 err = -ENOENT;
2777                 goto out;
2778         }
2779
2780         table = rt->rt6i_table;
2781         write_lock_bh(&table->tb6_lock);
2782         err = fib6_del(rt, info);
2783         write_unlock_bh(&table->tb6_lock);
2784
2785 out:
2786         ip6_rt_put(rt);
2787         return err;
2788 }
2789
2790 int ip6_del_rt(struct rt6_info *rt)
2791 {
2792         struct nl_info info = {
2793                 .nl_net = dev_net(rt->dst.dev),
2794         };
2795         return __ip6_del_rt(rt, &info);
2796 }
2797
/* Delete @rt and, when cfg->fc_delete_all_nh is set, all of its ECMP
 * siblings in one pass under the table write lock.  When possible a
 * single RTM_DELROUTE notification describing the whole multipath
 * route is sent instead of one message per hop.
 * Consumes the caller's reference on @rt.
 */
static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	int err = -ENOENT;

	/* The null entry is never deletable. */
	if (rt == net->ipv6.ip6_null_entry)
		goto out_put;
	table = rt->rt6i_table;
	write_lock_bh(&table->tb6_lock);

	if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
		struct rt6_info *sibling, *next_sibling;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			/* If the message cannot be built, fall back to the
			 * per-route notifications emitted by fib6_del().
			 */
			if (rt6_fill_node(net, skb, rt,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				kfree_skb(skb);
				skb = NULL;
			} else
				info->skip_notify = 1;
		}

		/* _safe walk: fib6_del() unlinks each sibling as we go. */
		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->rt6i_siblings,
					 rt6i_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
		}
	}

	err = fib6_del(rt, info);
out_unlock:
	write_unlock_bh(&table->tb6_lock);
out_put:
	ip6_rt_put(rt);

	/* Send the combined notification outside the table lock. */
	if (skb) {
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
	return err;
}
2849
/* Delete the first route in cfg->fc_table matching the dst/src prefix
 * and, where given, ifindex, gateway, metric and protocol.  With
 * RTF_CACHE set, the matching cached exception route is targeted
 * instead of the FIB entry itself.  Returns -ESRCH when nothing
 * matches.
 */
static int ip6_route_del(struct fib6_config *cfg,
			 struct netlink_ext_ack *extack)
{
	struct rt6_info *rt, *rt_cache;
	struct fib6_table *table;
	struct fib6_node *fn;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table) {
		NL_SET_ERR_MSG(extack, "FIB table does not exist");
		return err;
	}

	read_lock_bh(&table->tb6_lock);

	/* NOTE(review): the final fib6_locate() argument is inverted for
	 * RTF_CACHE lookups — confirm its exact-match semantics against
	 * fib6_locate()'s definition.
	 */
	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len,
			 !(cfg->fc_flags & RTF_CACHE));

	if (fn) {
		for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
			if (cfg->fc_flags & RTF_CACHE) {
				/* redirect the match to the cached clone */
				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
							      &cfg->fc_src);
				if (!rt_cache)
					continue;
				rt = rt_cache;
			}
			if (cfg->fc_ifindex &&
			    (!rt->dst.dev ||
			     rt->dst.dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
				continue;
			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
				continue;
			if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
				continue;
			/* Take a reference before dropping the lock; if the
			 * hold fails the route is already being freed, so
			 * stop the walk.
			 */
			if (!dst_hold_safe(&rt->dst))
				break;
			read_unlock_bh(&table->tb6_lock);

			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				return __ip6_del_rt(rt, &cfg->fc_nlinfo);

			return __ip6_del_rt_siblings(rt, cfg);
		}
	}
	read_unlock_bh(&table->tb6_lock);

	return err;
}
2906
/* Process a received ICMPv6 Redirect for the path described by @dst:
 * validate the message, update the neighbour cache entry for the new
 * first hop, and install a cached (exception) route steering
 * msg->dest via msg->target.  @sk is unused here.
 */
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct netevent_redirect netevent;
	struct rt6_info *rt, *nrt = NULL;
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	struct rd_msg *msg;
	int optlen, on_link;
	u8 *lladdr;

	/* length of the ND options trailing the fixed rd_msg header */
	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
	optlen -= sizeof(*msg);

	if (optlen < 0) {
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
		return;
	}

	msg = (struct rd_msg *)icmp6_hdr(skb);

	if (ipv6_addr_is_multicast(&msg->dest)) {
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
		return;
	}

	/* dest == target means the destination itself is on-link;
	 * otherwise the target must be a link-local unicast router.
	 */
	on_link = 0;
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
		on_link = 1;
	} else if (ipv6_addr_type(&msg->target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
		return;
	}

	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	/* forwarding interfaces ignore redirects; hosts honour the
	 * accept_redirects sysctl.
	 */
	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}

	/* optional target link-layer address option */
	lladdr = NULL;
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

	rt = (struct rt6_info *) dst;
	if (rt->rt6i_flags & RTF_REJECT) {
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
		return;
	}

	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);

	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
	if (!neigh)
		return;

	/*
	 *	We have finally decided to accept it.
	 */

	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER)),
		     NDISC_REDIRECT, &ndopts);

	/* clone the route and point the clone at the new next hop */
	nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
	if (!nrt)
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	nrt->rt6i_protocol = RTPROT_REDIRECT;
	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

	/* No need to remove rt from the exception table if rt is
	 * a cached route because rt6_insert_exception() will
	 * takes care of it
	 */
	if (rt6_insert_exception(nrt, rt)) {
		dst_release_immediate(&nrt->dst);
		goto out;
	}

	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
	netevent.daddr = &msg->dest;
	netevent.neigh = neigh;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

out:
	neigh_release(neigh);
}
3024
3025 /*
3026  *      Misc support functions
3027  */
3028
/* Make @rt inherit from @from: take a reference on @from's dst, link
 * it via rt->dst.from, and share @from's metrics read-only.  @from
 * must not itself be derived from another route (BUG otherwise).
 * NOTE(review): RTF_EXPIRES is cleared here — presumably expiry is
 * then governed through the "from" route; confirm against
 * rt6_set_expires()/rt6_clean_expires().
 */
static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
{
	BUG_ON(from->dst.from);

	rt->rt6i_flags &= ~RTF_EXPIRES;
	dst_hold(&from->dst);
	rt->dst.from = &from->dst;
	dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
}
3038
/* Initialise the freshly allocated @rt as a copy of @ort and link it
 * back to @ort via rt6_set_from().  Takes references on the shared
 * inet6_dev and lwtunnel state.
 */
static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
{
	rt->dst.input = ort->dst.input;
	rt->dst.output = ort->dst.output;
	rt->rt6i_dst = ort->rt6i_dst;
	rt->dst.error = ort->dst.error;
	rt->rt6i_idev = ort->rt6i_idev;
	if (rt->rt6i_idev)
		in6_dev_hold(rt->rt6i_idev);
	rt->dst.lastuse = jiffies;
	rt->rt6i_gateway = ort->rt6i_gateway;
	/* flags are copied before rt6_set_from(), which clears
	 * RTF_EXPIRES on @rt
	 */
	rt->rt6i_flags = ort->rt6i_flags;
	rt6_set_from(rt, ort);
	rt->rt6i_metric = ort->rt6i_metric;
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->rt6i_src;
#endif
	rt->rt6i_prefsrc = ort->rt6i_prefsrc;
	rt->rt6i_table = ort->rt6i_table;
	rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
}
3060
3061 #ifdef CONFIG_IPV6_ROUTE_INFO
3062 static struct rt6_info *rt6_get_route_info(struct net *net,
3063                                            const struct in6_addr *prefix, int prefixlen,
3064                                            const struct in6_addr *gwaddr,
3065                                            struct net_device *dev)
3066 {
3067         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3068         int ifindex = dev->ifindex;
3069         struct fib6_node *fn;
3070         struct rt6_info *rt = NULL;
3071         struct fib6_table *table;
3072
3073         table = fib6_get_table(net, tb_id);
3074         if (!table)
3075                 return NULL;
3076
3077         read_lock_bh(&table->tb6_lock);
3078         fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3079         if (!fn)
3080                 goto out;
3081
3082         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
3083                 if (rt->dst.dev->ifindex != ifindex)
3084                         continue;
3085                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3086                         continue;
3087                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
3088                         continue;
3089                 ip6_hold_safe(NULL, &rt, false);
3090                 break;
3091         }
3092 out:
3093         read_unlock_bh(&table->tb6_lock);
3094         return rt;
3095 }
3096
3097 static struct rt6_info *rt6_add_route_info(struct net *net,
3098                                            const struct in6_addr *prefix, int prefixlen,
3099                                            const struct in6_addr *gwaddr,
3100                                            struct net_device *dev,
3101                                            unsigned int pref)
3102 {
3103         struct fib6_config cfg = {
3104                 .fc_metric      = IP6_RT_PRIO_USER,
3105                 .fc_ifindex     = dev->ifindex,
3106                 .fc_dst_len     = prefixlen,
3107                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3108                                   RTF_UP | RTF_PREF(pref),
3109                 .fc_protocol = RTPROT_RA,
3110                 .fc_nlinfo.portid = 0,
3111                 .fc_nlinfo.nlh = NULL,
3112                 .fc_nlinfo.nl_net = net,
3113         };
3114
3115         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3116         cfg.fc_dst = *prefix;
3117         cfg.fc_gateway = *gwaddr;
3118
3119         /* We should treat it as a default route if prefix length is 0. */
3120         if (!prefixlen)
3121                 cfg.fc_flags |= RTF_DEFAULT;
3122
3123         ip6_route_add(&cfg, NULL);
3124
3125         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3126 }
3127 #endif
3128
3129 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
3130 {
3131         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3132         struct rt6_info *rt;
3133         struct fib6_table *table;
3134
3135         table = fib6_get_table(dev_net(dev), tb_id);
3136         if (!table)
3137                 return NULL;
3138
3139         read_lock_bh(&table->tb6_lock);
3140         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
3141                 if (dev == rt->dst.dev &&
3142                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3143                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
3144                         break;
3145         }
3146         if (rt)
3147                 ip6_hold_safe(NULL, &rt, false);
3148         read_unlock_bh(&table->tb6_lock);
3149         return rt;
3150 }
3151
3152 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
3153                                      struct net_device *dev,
3154                                      unsigned int pref)
3155 {
3156         struct fib6_config cfg = {
3157                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3158                 .fc_metric      = IP6_RT_PRIO_USER,
3159                 .fc_ifindex     = dev->ifindex,
3160                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3161                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3162                 .fc_protocol = RTPROT_RA,
3163                 .fc_nlinfo.portid = 0,
3164                 .fc_nlinfo.nlh = NULL,
3165                 .fc_nlinfo.nl_net = dev_net(dev),
3166         };
3167
3168         cfg.fc_gateway = *gwaddr;
3169
3170         if (!ip6_route_add(&cfg, NULL)) {
3171                 struct fib6_table *table;
3172
3173                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3174                 if (table)
3175                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3176         }
3177
3178         return rt6_get_dflt_router(gwaddr, dev);
3179 }
3180
/* Remove every RA-learnt default router route from @table, except on
 * interfaces whose accept_ra sysctl is 2.  Because the table read
 * lock must be dropped around ip6_del_rt(), the walk restarts from
 * the root after every deletion attempt.
 */
static void __rt6_purge_dflt_routers(struct fib6_table *table)
{
	struct rt6_info *rt;

restart:
	read_lock_bh(&table->tb6_lock);
	for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
			/* ip6_del_rt() consumes the reference taken here;
			 * if the hold fails the route is already being
			 * freed, so just restart the walk.
			 */
			if (dst_hold_safe(&rt->dst)) {
				read_unlock_bh(&table->tb6_lock);
				ip6_del_rt(rt);
			} else {
				read_unlock_bh(&table->tb6_lock);
			}
			goto restart;
		}
	}
	read_unlock_bh(&table->tb6_lock);

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}
3203
3204 void rt6_purge_dflt_routers(struct net *net)
3205 {
3206         struct fib6_table *table;
3207         struct hlist_head *head;
3208         unsigned int h;
3209
3210         rcu_read_lock();
3211
3212         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3213                 head = &net->ipv6.fib_table_hash[h];
3214                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3215                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3216                                 __rt6_purge_dflt_routers(table);
3217                 }
3218         }
3219
3220         rcu_read_unlock();
3221 }
3222
3223 static void rtmsg_to_fib6_config(struct net *net,
3224                                  struct in6_rtmsg *rtmsg,
3225                                  struct fib6_config *cfg)
3226 {
3227         memset(cfg, 0, sizeof(*cfg));
3228
3229         cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3230                          : RT6_TABLE_MAIN;
3231         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3232         cfg->fc_metric = rtmsg->rtmsg_metric;
3233         cfg->fc_expires = rtmsg->rtmsg_info;
3234         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3235         cfg->fc_src_len = rtmsg->rtmsg_src_len;
3236         cfg->fc_flags = rtmsg->rtmsg_flags;
3237
3238         cfg->fc_nlinfo.nl_net = net;
3239
3240         cfg->fc_dst = rtmsg->rtmsg_dst;
3241         cfg->fc_src = rtmsg->rtmsg_src;
3242         cfg->fc_gateway = rtmsg->rtmsg_gateway;
3243 }
3244
3245 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3246 {
3247         struct fib6_config cfg;
3248         struct in6_rtmsg rtmsg;
3249         int err;
3250
3251         switch (cmd) {
3252         case SIOCADDRT:         /* Add a route */
3253         case SIOCDELRT:         /* Delete a route */
3254                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3255                         return -EPERM;
3256                 err = copy_from_user(&rtmsg, arg,
3257                                      sizeof(struct in6_rtmsg));
3258                 if (err)
3259                         return -EFAULT;
3260
3261                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3262
3263                 rtnl_lock();
3264                 switch (cmd) {
3265                 case SIOCADDRT:
3266                         err = ip6_route_add(&cfg, NULL);
3267                         break;
3268                 case SIOCDELRT:
3269                         err = ip6_route_del(&cfg, NULL);
3270                         break;
3271                 default:
3272                         err = -EINVAL;
3273                 }
3274                 rtnl_unlock();
3275
3276                 return err;
3277         }
3278
3279         return -EINVAL;
3280 }
3281
3282 /*
3283  *      Drop the packet on the floor
3284  */
3285
3286 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3287 {
3288         int type;
3289         struct dst_entry *dst = skb_dst(skb);
3290         switch (ipstats_mib_noroutes) {
3291         case IPSTATS_MIB_INNOROUTES:
3292                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3293                 if (type == IPV6_ADDR_ANY) {
3294                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3295                                       IPSTATS_MIB_INADDRERRORS);
3296                         break;
3297                 }
3298                 /* FALLTHROUGH */
3299         case IPSTATS_MIB_OUTNOROUTES:
3300                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3301                               ipstats_mib_noroutes);
3302                 break;
3303         }
3304         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3305         kfree_skb(skb);
3306         return 0;
3307 }
3308
/* dst.input handler for unreachable-type reject routes: count an
 * input no-route, send ICMPv6 no-route, drop the packet.
 */
static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
3313
/* dst.output counterpart of ip6_pkt_discard(): point skb->dev at the
 * dst device before accounting, then drop with an output no-route.
 */
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}
3319
/* dst.input handler for RTN_PROHIBIT routes: count an input no-route,
 * send ICMPv6 administratively-prohibited, drop the packet.
 */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}
3324
/* dst.output counterpart of ip6_pkt_prohibit(): point skb->dev at the
 * dst device, then drop with administratively-prohibited.
 */
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}
3330
3331 /*
3332  *      Allocate a dst for local (unicast / anycast) address.
3333  */
3334
3335 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
3336                                     const struct in6_addr *addr,
3337                                     bool anycast)
3338 {
3339         u32 tb_id;
3340         struct net *net = dev_net(idev->dev);
3341         struct net_device *dev = idev->dev;
3342         struct rt6_info *rt;
3343
3344         rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
3345         if (!rt)
3346                 return ERR_PTR(-ENOMEM);
3347
3348         in6_dev_hold(idev);
3349
3350         rt->dst.flags |= DST_HOST;
3351         rt->dst.input = ip6_input;
3352         rt->dst.output = ip6_output;
3353         rt->rt6i_idev = idev;
3354
3355         rt->rt6i_protocol = RTPROT_KERNEL;
3356         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
3357         if (anycast)
3358                 rt->rt6i_flags |= RTF_ANYCAST;
3359         else
3360                 rt->rt6i_flags |= RTF_LOCAL;
3361
3362         rt->rt6i_gateway  = *addr;
3363         rt->rt6i_dst.addr = *addr;
3364         rt->rt6i_dst.plen = 128;
3365         tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3366         rt->rt6i_table = fib6_get_table(net, tb_id);
3367
3368         return rt;
3369 }
3370
/* remove deleted ip from prefsrc entries */
struct arg_dev_net_ip {
	struct net_device *dev;	/* restrict to this device; NULL matches any */
	struct net *net;	/* namespace being scanned */
	struct in6_addr *addr;	/* the address being removed */
};
3377
3378 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
3379 {
3380         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3381         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3382         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3383
3384         if (((void *)rt->dst.dev == dev || !dev) &&
3385             rt != net->ipv6.ip6_null_entry &&
3386             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
3387                 spin_lock_bh(&rt6_exception_lock);
3388                 /* remove prefsrc entry */
3389                 rt->rt6i_prefsrc.plen = 0;
3390                 /* need to update cache as well */
3391                 rt6_exceptions_remove_prefsrc(rt);
3392                 spin_unlock_bh(&rt6_exception_lock);
3393         }
3394         return 0;
3395 }
3396
3397 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3398 {
3399         struct net *net = dev_net(ifp->idev->dev);
3400         struct arg_dev_net_ip adni = {
3401                 .dev = ifp->idev->dev,
3402                 .net = net,
3403                 .addr = &ifp->addr,
3404         };
3405         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3406 }
3407
/* flag combination carried by router routes learned from Router
 * Advertisements (matched in fib6_clean_tohost()) */
#define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3409
3410 /* Remove routers and update dst entries when gateway turn into host. */
3411 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
3412 {
3413         struct in6_addr *gateway = (struct in6_addr *)arg;
3414
3415         if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3416             ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
3417                 return -1;
3418         }
3419
3420         /* Further clean up cached routes in exception table.
3421          * This is needed because cached route may have a different
3422          * gateway than its 'parent' in the case of an ip redirect.
3423          */
3424         rt6_exceptions_clean_tohost(rt, gateway);
3425
3426         return 0;
3427 }
3428
/* Walk every fib6 table in @net applying fib6_clean_tohost(), which
 * flags RA-learned router routes via @gateway and scrubs matching
 * exception-table entries.
 */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}
3433
/* argument block for rt6_ifdown() / fib6_ifdown() */
struct arg_dev_net {
	struct net_device *dev;	/* device going down; NULL matches any */
	struct net *net;	/* namespace being cleaned */
};
3438
3439 /* called with write lock held for table with rt */
3440 static int fib6_ifdown(struct rt6_info *rt, void *arg)
3441 {
3442         const struct arg_dev_net *adn = arg;
3443         const struct net_device *dev = adn->dev;
3444
3445         if ((rt->dst.dev == dev || !dev) &&
3446             rt != adn->net->ipv6.ip6_null_entry &&
3447             (rt->rt6i_nsiblings == 0 ||
3448              (dev && netdev_unregistering(dev)) ||
3449              !rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
3450                 return -1;
3451
3452         return 0;
3453 }
3454
3455 void rt6_ifdown(struct net *net, struct net_device *dev)
3456 {
3457         struct arg_dev_net adn = {
3458                 .dev = dev,
3459                 .net = net,
3460         };
3461
3462         fib6_clean_all(net, fib6_ifdown, &adn);
3463         if (dev)
3464                 rt6_uncached_list_flush_dev(net, dev);
3465 }
3466
/* argument block for rt6_mtu_change() / rt6_mtu_change_route() */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new MTU */
};
3471
/* fib6_clean_all() callback for rt6_mtu_change(): propagate a device
 * MTU change to each route using that device.  Always returns 0 (the
 * walk never deletes routes).
 */
static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e. jumbo frame)
	 */
	/*
	   If new MTU is less than route PMTU, this new MTU will be the
	   lowest MTU in the path, update the route PMTU to reflect PMTU
	   decreases; if new MTU is greater than route PMTU, and the
	   old MTU is the lowest MTU in the path, update the route PMTU
	   to reflect the increase. In this case if the other nodes' MTU
	   also have the lowest MTU, TOO BIG MESSAGE will be lead to
	   PMTU discovery.
	 */
	if (rt->dst.dev == arg->dev &&
	    dst_metric_raw(&rt->dst, RTAX_MTU) &&
	    !dst_metric_locked(&rt->dst, RTAX_MTU)) {
		/* rt6_exception_lock serializes the metric update with
		 * the exception-table update below */
		spin_lock_bh(&rt6_exception_lock);
		if (dst_mtu(&rt->dst) >= arg->mtu ||
		    (dst_mtu(&rt->dst) < arg->mtu &&
		     dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
			dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
		}
		/* cached clones hang off this route's exception table;
		 * update their pmtu too */
		rt6_exceptions_update_pmtu(rt, arg->mtu);
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}
3515
3516 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
3517 {
3518         struct rt6_mtu_change_arg arg = {
3519                 .dev = dev,
3520                 .mtu = mtu,
3521         };
3522
3523         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
3524 }
3525
/* netlink attribute validation policy for RTM_{NEW,DEL,GET}ROUTE.
 * Attributes not listed here (e.g. RTA_DST/RTA_SRC) are length-checked
 * by hand in rtm_to_fib6_config() against rtm_dst_len/rtm_src_len.
 */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
	[RTA_OIF]               = { .type = NLA_U32 },
	[RTA_IIF]               = { .type = NLA_U32 },
	[RTA_PRIORITY]          = { .type = NLA_U32 },
	[RTA_METRICS]           = { .type = NLA_NESTED },
	[RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]              = { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
	[RTA_ENCAP]             = { .type = NLA_NESTED },
	[RTA_EXPIRES]           = { .type = NLA_U32 },
	[RTA_UID]               = { .type = NLA_U32 },
	[RTA_MARK]              = { .type = NLA_U32 },
};
3540
/* Translate an RTM_{NEW,DEL}ROUTE netlink request into a
 * struct fib6_config.
 *
 * @skb:	request skb; supplies the sender's portid and namespace.
 * @nlh:	netlink header; attributes are parsed against
 *		rtm_ipv6_policy.
 * @cfg:	output — zeroed here, then filled from the rtmsg header
 *		and attributes.
 * @extack:	extended ack for validation errors from the lwtunnel
 *		helpers.
 *
 * Returns 0 on success, a negative errno from nlmsg_parse()/lwtunnel
 * validation, or -EINVAL when RTA_DST/RTA_SRC are shorter than the
 * prefix lengths claimed in the rtmsg header.
 */
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg,
			      struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  NULL);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	memset(cfg, 0, sizeof(*cfg));

	cfg->fc_table = rtm->rtm_table;
	cfg->fc_dst_len = rtm->rtm_dst_len;
	cfg->fc_src_len = rtm->rtm_src_len;
	cfg->fc_flags = RTF_UP;
	cfg->fc_protocol = rtm->rtm_protocol;
	cfg->fc_type = rtm->rtm_type;

	/* reject-style route types all become RTF_REJECT routes; the
	 * specific type is kept in fc_type */
	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
	cfg->fc_nlinfo.nlh = nlh;
	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}

	if (tb[RTA_DST]) {
		/* the attribute only needs to carry the prefix bytes */
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		/* fc_mx points into the request message; consumers must
		 * copy before the skb goes away */
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	/* RTA_TABLE overrides the 8-bit rtm_table field set above */
	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
						     cfg->fc_mp_len, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_PREF]) {
		/* unknown preference values fall back to MEDIUM */
		pref = nla_get_u8(tb[RTA_PREF]);
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

	if (tb[RTA_ENCAP_TYPE]) {
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_EXPIRES]) {
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

		/* only finite timeouts set RTF_EXPIRES; an infinite
		 * timeout leaves the route permanent */
		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
		}
	}

	err = 0;
errout:
	return err;
}
3664
/* one pending nexthop while a multipath route request is being built */
struct rt6_nh {
	struct rt6_info *rt6_info;	/* route built for this nexthop */
	struct fib6_config r_cfg;	/* per-nexthop copy of the config */
	struct mx6_config mxc;		/* converted metrics (mxc.mx is kfree'd) */
	struct list_head next;		/* link on the local rt6_nh_list */
};
3671
3672 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
3673 {
3674         struct rt6_nh *nh;
3675
3676         list_for_each_entry(nh, rt6_nh_list, next) {
3677                 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
3678                         &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
3679                         nh->r_cfg.fc_ifindex);
3680         }
3681 }
3682
3683 static int ip6_route_info_append(struct list_head *rt6_nh_list,
3684                                  struct rt6_info *rt, struct fib6_config *r_cfg)
3685 {
3686         struct rt6_nh *nh;
3687         int err = -EEXIST;
3688
3689         list_for_each_entry(nh, rt6_nh_list, next) {
3690                 /* check if rt6_info already exists */
3691                 if (rt6_duplicate_nexthop(nh->rt6_info, rt))
3692                         return err;
3693         }
3694
3695         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
3696         if (!nh)
3697                 return -ENOMEM;
3698         nh->rt6_info = rt;
3699         err = ip6_convert_metrics(&nh->mxc, r_cfg);
3700         if (err) {
3701                 kfree(nh);
3702                 return err;
3703         }
3704         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
3705         list_add_tail(&nh->next, rt6_nh_list);
3706
3707         return 0;
3708 }
3709
3710 static void ip6_route_mpath_notify(struct rt6_info *rt,
3711                                    struct rt6_info *rt_last,
3712                                    struct nl_info *info,
3713                                    __u16 nlflags)
3714 {
3715         /* if this is an APPEND route, then rt points to the first route
3716          * inserted and rt_last points to last route inserted. Userspace
3717          * wants a consistent dump of the route which starts at the first
3718          * nexthop. Since sibling routes are always added at the end of
3719          * the list, find the first sibling of the last route appended
3720          */
3721         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
3722                 rt = list_first_entry(&rt_last->rt6i_siblings,
3723                                       struct rt6_info,
3724                                       rt6i_siblings);
3725         }
3726
3727         if (rt)
3728                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
3729 }
3730
3731 static int ip6_route_multipath_add(struct fib6_config *cfg,
3732                                    struct netlink_ext_ack *extack)
3733 {
3734         struct rt6_info *rt_notif = NULL, *rt_last = NULL;
3735         struct nl_info *info = &cfg->fc_nlinfo;
3736         struct fib6_config r_cfg;
3737         struct rtnexthop *rtnh;
3738         struct rt6_info *rt;
3739         struct rt6_nh *err_nh;
3740         struct rt6_nh *nh, *nh_safe;
3741         __u16 nlflags;
3742         int remaining;
3743         int attrlen;
3744         int err = 1;
3745         int nhn = 0;
3746         int replace = (cfg->fc_nlinfo.nlh &&
3747                        (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
3748         LIST_HEAD(rt6_nh_list);
3749
3750         nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
3751         if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
3752                 nlflags |= NLM_F_APPEND;
3753
3754         remaining = cfg->fc_mp_len;
3755         rtnh = (struct rtnexthop *)cfg->fc_mp;
3756
3757         /* Parse a Multipath Entry and build a list (rt6_nh_list) of
3758          * rt6_info structs per nexthop
3759          */
3760         while (rtnh_ok(rtnh, remaining)) {
3761                 memcpy(&r_cfg, cfg, sizeof(*cfg));
3762                 if (rtnh->rtnh_ifindex)
3763                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3764
3765                 attrlen = rtnh_attrlen(rtnh);
3766                 if (attrlen > 0) {
3767                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3768
3769                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3770                         if (nla) {
3771                                 r_cfg.fc_gateway = nla_get_in6_addr(nla);
3772                                 r_cfg.fc_flags |= RTF_GATEWAY;
3773                         }
3774                         r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
3775                         nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
3776                         if (nla)
3777                                 r_cfg.fc_encap_type = nla_get_u16(nla);
3778                 }
3779
3780                 rt = ip6_route_info_create(&r_cfg, extack);
3781                 if (IS_ERR(rt)) {
3782                         err = PTR_ERR(rt);
3783                         rt = NULL;
3784                         goto cleanup;
3785                 }
3786
3787                 err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
3788                 if (err) {
3789                         dst_release_immediate(&rt->dst);
3790                         goto cleanup;
3791                 }
3792
3793                 rtnh = rtnh_next(rtnh, &remaining);
3794         }
3795
3796         /* for add and replace send one notification with all nexthops.
3797          * Skip the notification in fib6_add_rt2node and send one with
3798          * the full route when done
3799          */
3800         info->skip_notify = 1;
3801
3802         err_nh = NULL;
3803         list_for_each_entry(nh, &rt6_nh_list, next) {
3804                 rt_last = nh->rt6_info;
3805                 err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
3806                 /* save reference to first route for notification */
3807                 if (!rt_notif && !err)
3808                         rt_notif = nh->rt6_info;
3809
3810                 /* nh->rt6_info is used or freed at this point, reset to NULL*/
3811                 nh->rt6_info = NULL;
3812                 if (err) {
3813                         if (replace && nhn)
3814                                 ip6_print_replace_route_err(&rt6_nh_list);
3815                         err_nh = nh;
3816                         goto add_errout;
3817                 }
3818
3819                 /* Because each route is added like a single route we remove
3820                  * these flags after the first nexthop: if there is a collision,
3821                  * we have already failed to add the first nexthop:
3822                  * fib6_add_rt2node() has rejected it; when replacing, old
3823                  * nexthops have been replaced by first new, the rest should
3824                  * be added to it.
3825                  */
3826                 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
3827                                                      NLM_F_REPLACE);
3828                 nhn++;
3829         }
3830
3831         /* success ... tell user about new route */
3832         ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3833         goto cleanup;
3834
3835 add_errout:
3836         /* send notification for routes that were added so that
3837          * the delete notifications sent by ip6_route_del are
3838          * coherent
3839          */
3840         if (rt_notif)
3841                 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3842
3843         /* Delete routes that were already added */
3844         list_for_each_entry(nh, &rt6_nh_list, next) {
3845                 if (err_nh == nh)
3846                         break;
3847                 ip6_route_del(&nh->r_cfg, extack);
3848         }
3849
3850 cleanup:
3851         list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
3852                 if (nh->rt6_info)
3853                         dst_release_immediate(&nh->rt6_info->dst);
3854                 kfree(nh->mxc.mx);
3855                 list_del(&nh->next);
3856                 kfree(nh);
3857         }
3858
3859         return err;
3860 }
3861
3862 static int ip6_route_multipath_del(struct fib6_config *cfg,
3863                                    struct netlink_ext_ack *extack)
3864 {
3865         struct fib6_config r_cfg;
3866         struct rtnexthop *rtnh;
3867         int remaining;
3868         int attrlen;
3869         int err = 1, last_err = 0;
3870
3871         remaining = cfg->fc_mp_len;
3872         rtnh = (struct rtnexthop *)cfg->fc_mp;
3873
3874         /* Parse a Multipath Entry */
3875         while (rtnh_ok(rtnh, remaining)) {
3876                 memcpy(&r_cfg, cfg, sizeof(*cfg));
3877                 if (rtnh->rtnh_ifindex)
3878                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3879
3880                 attrlen = rtnh_attrlen(rtnh);
3881                 if (attrlen > 0) {
3882                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3883
3884                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3885                         if (nla) {
3886                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
3887                                 r_cfg.fc_flags |= RTF_GATEWAY;
3888                         }
3889                 }
3890                 err = ip6_route_del(&r_cfg, extack);
3891                 if (err)
3892                         last_err = err;
3893
3894                 rtnh = rtnh_next(rtnh, &remaining);
3895         }
3896
3897         return last_err;
3898 }
3899
3900 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3901                               struct netlink_ext_ack *extack)
3902 {
3903         struct fib6_config cfg;
3904         int err;
3905
3906         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3907         if (err < 0)
3908                 return err;
3909
3910         if (cfg.fc_mp)
3911                 return ip6_route_multipath_del(&cfg, extack);
3912         else {
3913                 cfg.fc_delete_all_nh = 1;
3914                 return ip6_route_del(&cfg, extack);
3915         }
3916 }
3917
3918 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3919                               struct netlink_ext_ack *extack)
3920 {
3921         struct fib6_config cfg;
3922         int err;
3923
3924         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3925         if (err < 0)
3926                 return err;
3927
3928         if (cfg.fc_mp)
3929                 return ip6_route_multipath_add(&cfg, extack);
3930         else
3931                 return ip6_route_add(&cfg, extack);
3932 }
3933
3934 static size_t rt6_nlmsg_size(struct rt6_info *rt)
3935 {
3936         int nexthop_len = 0;
3937
3938         if (rt->rt6i_nsiblings) {
3939                 nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
3940                             + NLA_ALIGN(sizeof(struct rtnexthop))
3941                             + nla_total_size(16) /* RTA_GATEWAY */
3942                             + lwtunnel_get_encap_size(rt->dst.lwtstate);
3943
3944                 nexthop_len *= rt->rt6i_nsiblings;
3945         }
3946
3947         return NLMSG_ALIGN(sizeof(struct rtmsg))
3948                + nla_total_size(16) /* RTA_SRC */
3949                + nla_total_size(16) /* RTA_DST */
3950                + nla_total_size(16) /* RTA_GATEWAY */
3951                + nla_total_size(16) /* RTA_PREFSRC */
3952                + nla_total_size(4) /* RTA_TABLE */
3953                + nla_total_size(4) /* RTA_IIF */
3954                + nla_total_size(4) /* RTA_OIF */
3955                + nla_total_size(4) /* RTA_PRIORITY */
3956                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
3957                + nla_total_size(sizeof(struct rta_cacheinfo))
3958                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
3959                + nla_total_size(1) /* RTA_PREF */
3960                + lwtunnel_get_encap_size(rt->dst.lwtstate)
3961                + nexthop_len;
3962 }
3963
3964 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
3965                             unsigned int *flags, bool skip_oif)
3966 {
3967         if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) {
3968                 *flags |= RTNH_F_LINKDOWN;
3969                 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
3970                         *flags |= RTNH_F_DEAD;
3971         }
3972
3973         if (rt->rt6i_flags & RTF_GATEWAY) {
3974                 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
3975                         goto nla_put_failure;
3976         }
3977
3978         if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
3979                 *flags |= RTNH_F_OFFLOAD;
3980
3981         /* not needed for multipath encoding b/c it has a rtnexthop struct */
3982         if (!skip_oif && rt->dst.dev &&
3983             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
3984                 goto nla_put_failure;
3985
3986         if (rt->dst.lwtstate &&
3987             lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
3988                 goto nla_put_failure;
3989
3990         return 0;
3991
3992 nla_put_failure:
3993         return -EMSGSIZE;
3994 }
3995
3996 /* add multipath next hop */
3997 static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
3998 {
3999         struct rtnexthop *rtnh;
4000         unsigned int flags = 0;
4001
4002         rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4003         if (!rtnh)
4004                 goto nla_put_failure;
4005
4006         rtnh->rtnh_hops = 0;
4007         rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
4008
4009         if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4010                 goto nla_put_failure;
4011
4012         rtnh->rtnh_flags = flags;
4013
4014         /* length of rtnetlink header + attributes */
4015         rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4016
4017         return 0;
4018
4019 nla_put_failure:
4020         return -EMSGSIZE;
4021 }
4022
/* Build one netlink route message (type @type, e.g. RTM_NEWROUTE)
 * describing @rt into @skb.
 *
 * @dst/@src: when non-NULL, the concrete addresses of the lookup that
 *	found @rt (getroute replies); the corresponding prefix length
 *	is then reported as 128.  NULL for plain table dumps.
 * @iif: input interface to report (getroute with an input device),
 *	0 to omit.
 * @portid/@seq/@flags: netlink addressing for nlmsg_put().
 *
 * Returns 0 on success, -EMSGSIZE when the skb has no room (the
 * partial message is cancelled).
 */
static int rt6_fill_node(struct net *net,
			 struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	u32 metrics[RTAX_MAX];
	struct rtmsg *rtm;
	struct nlmsghdr *nlh;
	long expires;
	u32 table;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt->rt6i_dst.plen;
	rtm->rtm_src_len = rt->rt6i_src.plen;
	rtm->rtm_tos = 0;
	if (rt->rt6i_table)
		table = rt->rt6i_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	rtm->rtm_table = table;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;
	/* map route flags / dst.error onto the user-visible RTN_* type */
	if (rt->rt6i_flags & RTF_REJECT) {
		switch (rt->dst.error) {
		case -EINVAL:
			rtm->rtm_type = RTN_BLACKHOLE;
			break;
		case -EACCES:
			rtm->rtm_type = RTN_PROHIBIT;
			break;
		case -EAGAIN:
			rtm->rtm_type = RTN_THROW;
			break;
		default:
			rtm->rtm_type = RTN_UNREACHABLE;
			break;
		}
	}
	else if (rt->rt6i_flags & RTF_LOCAL)
		rtm->rtm_type = RTN_LOCAL;
	else if (rt->rt6i_flags & RTF_ANYCAST)
		rtm->rtm_type = RTN_ANYCAST;
	else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
		rtm->rtm_type = RTN_LOCAL;
	else
		rtm->rtm_type = RTN_UNICAST;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->rt6i_protocol;

	if (rt->rt6i_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	/* a concrete lookup destination takes precedence over the prefix */
	if (dst) {
		if (nla_put_in6_addr(skb, RTA_DST, dst))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		/* multicast destinations are resolved via the mroute
		 * cache, which fills the message itself */
		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dst) {
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->rt6i_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->rt6i_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	/* report a per-route pmtu override in place of the dst metric */
	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	if (rt->rt6i_pmtu)
		metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt->rt6i_nsiblings) {
		struct rt6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (rt6_add_nexthop(skb, rt) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->rt6i_siblings, rt6i_siblings) {
			if (rt6_add_nexthop(skb, sibling) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
			goto nla_put_failure;
	}

	expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
		goto nla_put_failure;


	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	/* skb ran out of tailroom: undo the partially built message */
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
4176
4177 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
4178 {
4179         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4180         struct net *net = arg->net;
4181
4182         if (rt == net->ipv6.ip6_null_entry)
4183                 return 0;
4184
4185         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4186                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4187
4188                 /* user wants prefix routes only */
4189                 if (rtm->rtm_flags & RTM_F_PREFIX &&
4190                     !(rt->rt6i_flags & RTF_PREFIX_RT)) {
4191                         /* success since this is not a prefix route */
4192                         return 1;
4193                 }
4194         }
4195
4196         return rt6_fill_node(net,
4197                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
4198                      NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
4199                      NLM_F_MULTI);
4200 }
4201
/* RTM_GETROUTE handler: resolve the route described by the request's
 * attributes and unicast a single RTM_NEWROUTE reply to the sender.
 *
 * RTA_IIF selects an input-path lookup on that device; otherwise an
 * output-path lookup is done (optionally constrained by RTA_OIF).
 * With RTM_F_FIB_MATCH set, the reply describes the matching FIB entry
 * (plain table lookup) rather than the fully resolved route.
 *
 * Returns 0 on success or a negative errno.
 */
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
                              struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(in_skb->sk);
        struct nlattr *tb[RTA_MAX+1];
        int err, iif = 0, oif = 0;
        struct dst_entry *dst;
        struct rt6_info *rt;
        struct sk_buff *skb;
        struct rtmsg *rtm;
        struct flowi6 fl6;
        bool fibmatch;

        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
                          extack);
        if (err < 0)
                goto errout;

        err = -EINVAL;
        memset(&fl6, 0, sizeof(fl6));
        rtm = nlmsg_data(nlh);
        fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
        fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

        /* Copy optional addresses into the flow key, rejecting
         * truncated attributes with -EINVAL. */
        if (tb[RTA_SRC]) {
                if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
                        goto errout;

                fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
        }

        if (tb[RTA_DST]) {
                if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
                        goto errout;

                fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
        }

        if (tb[RTA_IIF])
                iif = nla_get_u32(tb[RTA_IIF]);

        if (tb[RTA_OIF])
                oif = nla_get_u32(tb[RTA_OIF]);

        if (tb[RTA_MARK])
                fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

        /* UID-based routing key: explicit RTA_UID wins; otherwise use
         * the caller's uid for output lookups, none for input ones. */
        if (tb[RTA_UID])
                fl6.flowi6_uid = make_kuid(current_user_ns(),
                                           nla_get_u32(tb[RTA_UID]));
        else
                fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

        if (iif) {
                struct net_device *dev;
                int flags = 0;

                /* RCU protects the by-index device lookup only; the
                 * route lookups take their own references. */
                rcu_read_lock();

                dev = dev_get_by_index_rcu(net, iif);
                if (!dev) {
                        rcu_read_unlock();
                        err = -ENODEV;
                        goto errout;
                }

                fl6.flowi6_iif = iif;

                if (!ipv6_addr_any(&fl6.saddr))
                        flags |= RT6_LOOKUP_F_HAS_SADDR;

                if (!fibmatch)
                        dst = ip6_route_input_lookup(net, dev, &fl6, flags);
                else
                        dst = ip6_route_lookup(net, &fl6, 0);

                rcu_read_unlock();
        } else {
                fl6.flowi6_oif = oif;

                if (!fibmatch)
                        dst = ip6_route_output(net, NULL, &fl6);
                else
                        dst = ip6_route_lookup(net, &fl6, 0);
        }


        rt = container_of(dst, struct rt6_info, dst);
        if (rt->dst.error) {
                err = rt->dst.error;
                ip6_rt_put(rt);
                goto errout;
        }

        /* NOTE(review): this branch looks unreachable -- ip6_null_entry
         * carries a nonzero dst.error, so the check above should have
         * caught it already.  Confirm before removing. */
        if (rt == net->ipv6.ip6_null_entry) {
                err = rt->dst.error;
                ip6_rt_put(rt);
                goto errout;
        }

        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
        if (!skb) {
                ip6_rt_put(rt);
                err = -ENOBUFS;
                goto errout;
        }

        /* The skb now owns the route reference; kfree_skb() drops it. */
        skb_dst_set(skb, &rt->dst);
        if (fibmatch)
                err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
                                    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
                                    nlh->nlmsg_seq, 0);
        else
                err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
                                    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
                                    nlh->nlmsg_seq, 0);
        if (err < 0) {
                kfree_skb(skb);
                goto errout;
        }

        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
        return err;
}
4327
4328 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
4329                      unsigned int nlm_flags)
4330 {
4331         struct sk_buff *skb;
4332         struct net *net = info->nl_net;
4333         u32 seq;
4334         int err;
4335
4336         err = -ENOBUFS;
4337         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4338
4339         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4340         if (!skb)
4341                 goto errout;
4342
4343         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
4344                                 event, info->portid, seq, nlm_flags);
4345         if (err < 0) {
4346                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4347                 WARN_ON(err == -EMSGSIZE);
4348                 kfree_skb(skb);
4349                 goto errout;
4350         }
4351         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4352                     info->nlh, gfp_any());
4353         return;
4354 errout:
4355         if (err < 0)
4356                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4357 }
4358
/* Netdevice notifier: keep each namespace's special route entries
 * (null and, with multiple tables, prohibit/blackhole) bound to that
 * namespace's loopback device.
 *
 * On NETDEV_REGISTER of loopback, point the entries at the device and
 * take idev references; on NETDEV_UNREGISTER, drop those references.
 */
static int ip6_route_dev_notify(struct notifier_block *this,
                                unsigned long event, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
        struct net *net = dev_net(dev);

        /* Only the loopback device matters here. */
        if (!(dev->flags & IFF_LOOPBACK))
                return NOTIFY_OK;

        if (event == NETDEV_REGISTER) {
                net->ipv6.ip6_null_entry->dst.dev = dev;
                net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
                net->ipv6.ip6_prohibit_entry->dst.dev = dev;
                net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
                net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
                net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
         } else if (event == NETDEV_UNREGISTER &&
                    dev->reg_state != NETREG_UNREGISTERED) {
                /* NETDEV_UNREGISTER could be fired for multiple times by
                 * netdev_wait_allrefs(). Make sure we only call this once.
                 */
                in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
                in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
                in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
        }

        return NOTIFY_OK;
}
4391
4392 /*
4393  *      /proc
4394  */
4395
4396 #ifdef CONFIG_PROC_FS
4397
/* /proc/net/ipv6_route: seq_file dump of the routing table
 * (open handler ipv6_route_open is defined elsewhere in this file). */
static const struct file_operations ipv6_route_proc_fops = {
        .owner          = THIS_MODULE,
        .open           = ipv6_route_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = seq_release_net,
};
4405
4406 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
4407 {
4408         struct net *net = (struct net *)seq->private;
4409         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
4410                    net->ipv6.rt6_stats->fib_nodes,
4411                    net->ipv6.rt6_stats->fib_route_nodes,
4412                    net->ipv6.rt6_stats->fib_rt_alloc,
4413                    net->ipv6.rt6_stats->fib_rt_entries,
4414                    net->ipv6.rt6_stats->fib_rt_cache,
4415                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
4416                    net->ipv6.rt6_stats->fib_discarded_routes);
4417
4418         return 0;
4419 }
4420
/* Open handler for /proc/net/rt6_stats: single-record seq_file bound
 * to the inode's network namespace. */
static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
        return single_open_net(inode, file, rt6_stats_seq_show);
}
4425
/* File operations for /proc/net/rt6_stats. */
static const struct file_operations rt6_stats_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt6_stats_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = single_release_net,
};
4433 #endif  /* CONFIG_PROC_FS */
4434
4435 #ifdef CONFIG_SYSCTL
4436
4437 static
4438 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4439                               void __user *buffer, size_t *lenp, loff_t *ppos)
4440 {
4441         struct net *net;
4442         int delay;
4443         if (!write)
4444                 return -EINVAL;
4445
4446         net = (struct net *)ctl->extra1;
4447         delay = net->ipv6.sysctl.flush_delay;
4448         proc_dointvec(ctl, write, buffer, lenp, ppos);
4449         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
4450         return 0;
4451 }
4452
/* Template for the per-netns /proc/sys/net/ipv6/route/ table.
 *
 * NOTE: ipv6_route_sysctl_init() rebinds each entry's .data by numeric
 * index (table[0]..table[9]); keep the entry order here in sync with
 * that function.
 */
struct ctl_table ipv6_route_table_template[] = {
        {
                /* write-only trigger: writing runs a FIB gc/flush */
                .procname       =       "flush",
                .data           =       &init_net.ipv6.sysctl.flush_delay,
                .maxlen         =       sizeof(int),
                .mode           =       0200,
                .proc_handler   =       ipv6_sysctl_rtcache_flush
        },
        {
                .procname       =       "gc_thresh",
                .data           =       &ip6_dst_ops_template.gc_thresh,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec,
        },
        {
                .procname       =       "max_size",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec,
        },
        {
                /* jiffies-valued entries use the *_jiffies handlers */
                .procname       =       "gc_min_interval",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_jiffies,
        },
        {
                .procname       =       "gc_timeout",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_jiffies,
        },
        {
                .procname       =       "gc_interval",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_jiffies,
        },
        {
                .procname       =       "gc_elasticity",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec,
        },
        {
                .procname       =       "mtu_expires",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_jiffies,
        },
        {
                .procname       =       "min_adv_mss",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec,
        },
        {
                /* same backing value as gc_min_interval, in milliseconds */
                .procname       =       "gc_min_interval_ms",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_ms_jiffies,
        },
        { }
};
4526
/* Duplicate the sysctl template for namespace @net and rebind each
 * entry's .data pointer to the per-netns value.
 *
 * Returns the kmemdup'd table (owned by the caller) or NULL on OOM.
 */
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
        struct ctl_table *table;

        table = kmemdup(ipv6_route_table_template,
                        sizeof(ipv6_route_table_template),
                        GFP_KERNEL);

        if (table) {
                /* Indices must match ipv6_route_table_template order. */
                table[0].data = &net->ipv6.sysctl.flush_delay;
                table[0].extra1 = net; /* flush handler needs the netns */
                table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
                table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
                table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
                table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
                table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
                table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
                table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
                table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
                table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;

                /* Don't export sysctls to unprivileged users */
                /* (a NULL procname terminates the table, so this hides
                 * every entry from non-init user namespaces) */
                if (net->user_ns != &init_user_ns)
                        table[0].procname = NULL;
        }

        return table;
}
4555 #endif
4556
/* Per-netns setup for IPv6 routing: clone the dst_ops template,
 * allocate the special route entries (null and, with multiple tables,
 * prohibit/blackhole) and seed the routing sysctl defaults.
 *
 * Returns 0 on success or -ENOMEM, unwinding partial allocations.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
        int ret = -ENOMEM;

        memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
               sizeof(net->ipv6.ip6_dst_ops));

        if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
                goto out_ip6_dst_ops;

        /* The dst returned when no route matches. */
        net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
                                           sizeof(*net->ipv6.ip6_null_entry),
                                           GFP_KERNEL);
        if (!net->ipv6.ip6_null_entry)
                goto out_ip6_dst_entries;
        net->ipv6.ip6_null_entry->dst.path =
                (struct dst_entry *)net->ipv6.ip6_null_entry;
        net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
        dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
                         ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
        net->ipv6.fib6_has_custom_rules = false;
        net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
                                               sizeof(*net->ipv6.ip6_prohibit_entry),
                                               GFP_KERNEL);
        if (!net->ipv6.ip6_prohibit_entry)
                goto out_ip6_null_entry;
        net->ipv6.ip6_prohibit_entry->dst.path =
                (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
        net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
        dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
                         ip6_template_metrics, true);

        net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
                                               sizeof(*net->ipv6.ip6_blk_hole_entry),
                                               GFP_KERNEL);
        if (!net->ipv6.ip6_blk_hole_entry)
                goto out_ip6_prohibit_entry;
        net->ipv6.ip6_blk_hole_entry->dst.path =
                (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
        net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
        dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
                         ip6_template_metrics, true);
#endif

        /* Defaults for /proc/sys/net/ipv6/route/ knobs. */
        net->ipv6.sysctl.flush_delay = 0;
        net->ipv6.sysctl.ip6_rt_max_size = 4096;
        net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
        net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
        net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
        net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
        net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
        net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

        net->ipv6.ip6_rt_gc_expire = 30*HZ;

        ret = 0;
out:
        return ret;

        /* Error unwind, in reverse allocation order. */
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
        kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
        kfree(net->ipv6.ip6_null_entry);
#endif
out_ip6_dst_entries:
        dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
        goto out;
}
4629
/* Per-netns teardown: free the special route entries allocated by
 * ip6_route_net_init() and destroy the dst-entry accounting. */
static void __net_exit ip6_route_net_exit(struct net *net)
{
        kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
        kfree(net->ipv6.ip6_prohibit_entry);
        kfree(net->ipv6.ip6_blk_hole_entry);
#endif
        dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
4639
/* Late per-netns init: create the /proc/net entries once the rest of
 * the routing state for the namespace is in place. */
static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
        proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
        proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
#endif
        return 0;
}
4648
/* Late per-netns exit: remove the /proc/net entries created by
 * ip6_route_net_init_late(). */
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
        remove_proc_entry("ipv6_route", net->proc_net);
        remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
4656
/* Main per-netns lifecycle hooks for the IPv6 routing state. */
static struct pernet_operations ip6_route_net_ops = {
        .init = ip6_route_net_init,
        .exit = ip6_route_net_exit,
};
4661
4662 static int __net_init ipv6_inetpeer_init(struct net *net)
4663 {
4664         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
4665
4666         if (!bp)
4667                 return -ENOMEM;
4668         inet_peer_base_init(bp);
4669         net->ipv6.peers = bp;
4670         return 0;
4671 }
4672
4673 static void __net_exit ipv6_inetpeer_exit(struct net *net)
4674 {
4675         struct inet_peer_base *bp = net->ipv6.peers;
4676
4677         net->ipv6.peers = NULL;
4678         inetpeer_invalidate_tree(bp);
4679         kfree(bp);
4680 }
4681
/* Per-netns lifecycle hooks for the IPv6 inetpeer base. */
static struct pernet_operations ipv6_inetpeer_ops = {
        .init   =       ipv6_inetpeer_init,
        .exit   =       ipv6_inetpeer_exit,
};
4686
/* Late per-netns hooks (procfs entries), registered after the rest. */
static struct pernet_operations ip6_route_net_late_ops = {
        .init = ip6_route_net_init_late,
        .exit = ip6_route_net_exit_late,
};
4691
/* Netdevice notifier; runs after addrconf's notifier (lower priority)
 * so idev state exists when ip6_route_dev_notify() takes references. */
static struct notifier_block ip6_route_dev_notifier = {
        .notifier_call = ip6_route_dev_notify,
        .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
4696
/* Boot-time fixup for init_net: bind its special route entries to the
 * loopback device and take the idev references that the notifier path
 * would otherwise have taken. */
void __init ip6_route_init_special_entries(void)
{
        /* Registering of the loopback is done before this portion of code,
         * the loopback reference in rt6_info will not be taken, do it
         * manually for init_net */
        init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
        init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
        init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
        init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
        init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
        init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #endif
}
4711
/* Boot-time init of the IPv6 routing subsystem.  Order matters:
 * slab cache -> blackhole dst accounting -> pernet subsystems ->
 * FIB -> xfrm -> policy rules -> late pernet (procfs) -> rtnetlink
 * handlers -> netdev notifier.  Failures unwind via the goto chain
 * in reverse order.
 *
 * Returns 0 on success or a negative errno.
 */
int __init ip6_route_init(void)
{
        int ret;
        int cpu;

        ret = -ENOMEM;
        ip6_dst_ops_template.kmem_cachep =
                kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
                                  SLAB_HWCACHE_ALIGN, NULL);
        if (!ip6_dst_ops_template.kmem_cachep)
                goto out;

        ret = dst_entries_init(&ip6_dst_blackhole_ops);
        if (ret)
                goto out_kmem_cache;

        ret = register_pernet_subsys(&ipv6_inetpeer_ops);
        if (ret)
                goto out_dst_entries;

        ret = register_pernet_subsys(&ip6_route_net_ops);
        if (ret)
                goto out_register_inetpeer;

        /* Blackhole dst_ops reuse the rt6_info slab cache. */
        ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

        ret = fib6_init();
        if (ret)
                goto out_register_subsys;

        ret = xfrm6_init();
        if (ret)
                goto out_fib6_init;

        ret = fib6_rules_init();
        if (ret)
                goto xfrm6_init;

        ret = register_pernet_subsys(&ip6_route_net_late_ops);
        if (ret)
                goto fib6_rules_init;

        ret = -ENOBUFS;
        if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, 0) ||
            __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, 0) ||
            __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL,
                            RTNL_FLAG_DOIT_UNLOCKED))
                goto out_register_late_subsys;

        ret = register_netdevice_notifier(&ip6_route_dev_notifier);
        if (ret)
                goto out_register_late_subsys;

        /* Initialize the per-cpu uncached-route lists. */
        for_each_possible_cpu(cpu) {
                struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

                INIT_LIST_HEAD(&ul->head);
                spin_lock_init(&ul->lock);
        }

out:
        return ret;

        /* Error unwind, in reverse registration order. */
out_register_late_subsys:
        unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
        fib6_rules_cleanup();
xfrm6_init:
        xfrm6_fini();
out_fib6_init:
        fib6_gc_cleanup();
out_register_subsys:
        unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
        unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
        dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
        kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
        goto out;
}
4793
/* Tear down everything set up by ip6_route_init(), in the reverse
 * order of registration. */
void ip6_route_cleanup(void)
{
        unregister_netdevice_notifier(&ip6_route_dev_notifier);
        unregister_pernet_subsys(&ip6_route_net_late_ops);
        fib6_rules_cleanup();
        xfrm6_fini();
        fib6_gc_cleanup();
        unregister_pernet_subsys(&ipv6_inetpeer_ops);
        unregister_pernet_subsys(&ip6_route_net_ops);
        dst_entries_destroy(&ip6_dst_blackhole_ops);
        kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}