]> asedeno.scripts.mit.edu Git - linux.git/blob - net/ipv6/route.c
Merge branch 'i2c/for-current' of git://git.kernel.org/pub/scm/linux/kernel/git/wsa/linux
[linux.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <trace/events/fib6.h>
67
68 #include <linux/uaccess.h>
69
70 #ifdef CONFIG_SYSCTL
71 #include <linux/sysctl.h>
72 #endif
73
/*
 * Result codes used while scoring a route.
 *
 * rt6_check_neigh() produces them and rt6_score_route() passes negative
 * values straight through to its caller.  find_match() skips the route
 * on RT6_NUD_FAIL_HARD and, on RT6_NUD_FAIL_DO_RR, treats the route as
 * score 0 while asking for round-robin.
 */
enum rt6_nud_state {
        RT6_NUD_FAIL_HARD = -3,         /* initial value in rt6_check_neigh(); also
                                         * returned on a strict interface mismatch */
        RT6_NUD_FAIL_PROBE = -2,        /* neighbour is in NUD_FAILED
                                         * (CONFIG_IPV6_ROUTER_PREF only) */
        RT6_NUD_FAIL_DO_RR = -1,        /* no neighbour entry and router preference
                                         * support is not built in */
        RT6_NUD_SUCCEED = 1
};
80
/*
 * Forward declarations.  Several of these are installed in the dst_ops
 * tables and in the route entry templates defined further down, so they
 * have to be visible before their definitions.
 */
static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
static unsigned int      ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void             ip6_dst_destroy(struct dst_entry *);
static void             ip6_dst_ifdown(struct dst_entry *,
                                       struct net_device *dev, int how);
static int               ip6_dst_gc(struct dst_ops *ops);

static int              ip6_pkt_discard(struct sk_buff *skb);
static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int              ip6_pkt_prohibit(struct sk_buff *skb);
static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void             ip6_link_failure(struct sk_buff *skb);
static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu);
static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void             rt6_dst_from_metrics_check(struct rt6_info *rt);
static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct rt6_info *rt);
static int rt6_fill_node(struct net *net,
                         struct sk_buff *skb, struct rt6_info *rt,
                         struct in6_addr *dst, struct in6_addr *src,
                         int iif, int type, u32 portid, u32 seq,
                         unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
                                           struct in6_addr *daddr,
                                           struct in6_addr *saddr);

/* Helpers for routes learned from Route Information options. */
#ifdef CONFIG_IPV6_ROUTE_INFO
static struct rt6_info *rt6_add_route_info(struct net *net,
                                           const struct in6_addr *prefix, int prefixlen,
                                           const struct in6_addr *gwaddr,
                                           struct net_device *dev,
                                           unsigned int pref);
static struct rt6_info *rt6_get_route_info(struct net *net,
                                           const struct in6_addr *prefix, int prefixlen,
                                           const struct in6_addr *gwaddr,
                                           struct net_device *dev);
#endif
123
/*
 * A list of routes together with the spinlock that guards it.
 * Routes are linked in through rt6_info.rt6i_uncached.
 */
struct uncached_list {
        spinlock_t              lock;   /* taken with spin_lock_bh() around list changes */
        struct list_head        head;
};

/* One uncached_list per CPU; rt6_uncached_list_add() uses the local one. */
static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
130
131 void rt6_uncached_list_add(struct rt6_info *rt)
132 {
133         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
134
135         rt->rt6i_uncached_list = ul;
136
137         spin_lock_bh(&ul->lock);
138         list_add_tail(&rt->rt6i_uncached, &ul->head);
139         spin_unlock_bh(&ul->lock);
140 }
141
142 void rt6_uncached_list_del(struct rt6_info *rt)
143 {
144         if (!list_empty(&rt->rt6i_uncached)) {
145                 struct uncached_list *ul = rt->rt6i_uncached_list;
146                 struct net *net = dev_net(rt->dst.dev);
147
148                 spin_lock_bh(&ul->lock);
149                 list_del(&rt->rt6i_uncached);
150                 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
151                 spin_unlock_bh(&ul->lock);
152         }
153 }
154
155 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
156 {
157         struct net_device *loopback_dev = net->loopback_dev;
158         int cpu;
159
160         if (dev == loopback_dev)
161                 return;
162
163         for_each_possible_cpu(cpu) {
164                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
165                 struct rt6_info *rt;
166
167                 spin_lock_bh(&ul->lock);
168                 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
169                         struct inet6_dev *rt_idev = rt->rt6i_idev;
170                         struct net_device *rt_dev = rt->dst.dev;
171
172                         if (rt_idev->dev == dev) {
173                                 rt->rt6i_idev = in6_dev_get(loopback_dev);
174                                 in6_dev_put(rt_idev);
175                         }
176
177                         if (rt_dev == dev) {
178                                 rt->dst.dev = loopback_dev;
179                                 dev_hold(rt->dst.dev);
180                                 dev_put(rt_dev);
181                         }
182                 }
183                 spin_unlock_bh(&ul->lock);
184         }
185 }
186
187 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
188 {
189         return dst_metrics_write_ptr(&rt->from->dst);
190 }
191
192 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
193 {
194         struct rt6_info *rt = (struct rt6_info *)dst;
195
196         if (rt->rt6i_flags & RTF_PCPU)
197                 return rt6_pcpu_cow_metrics(rt);
198         else if (rt->rt6i_flags & RTF_CACHE)
199                 return NULL;
200         else
201                 return dst_cow_metrics_generic(dst, old);
202 }
203
204 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
205                                              struct sk_buff *skb,
206                                              const void *daddr)
207 {
208         struct in6_addr *p = &rt->rt6i_gateway;
209
210         if (!ipv6_addr_any(p))
211                 return (const void *) p;
212         else if (skb)
213                 return &ipv6_hdr(skb)->daddr;
214         return daddr;
215 }
216
217 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
218                                           struct sk_buff *skb,
219                                           const void *daddr)
220 {
221         struct rt6_info *rt = (struct rt6_info *) dst;
222         struct neighbour *n;
223
224         daddr = choose_neigh_daddr(rt, skb, daddr);
225         n = __ipv6_neigh_lookup(dst->dev, daddr);
226         if (n)
227                 return n;
228         return neigh_create(&nd_tbl, daddr, dst->dev);
229 }
230
231 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
232 {
233         struct net_device *dev = dst->dev;
234         struct rt6_info *rt = (struct rt6_info *)dst;
235
236         daddr = choose_neigh_daddr(rt, NULL, daddr);
237         if (!daddr)
238                 return;
239         if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
240                 return;
241         if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
242                 return;
243         __ipv6_confirm_neigh(dev, daddr);
244 }
245
/*
 * dst_ops table for regular IPv6 routes, wiring in the callbacks
 * declared/defined in this file.
 * NOTE(review): presumably copied into net->ipv6.ip6_dst_ops for each
 * namespace (that is what __ip6_dst_alloc() allocates from) - the copy
 * itself is not visible in this part of the file.
 */
static struct dst_ops ip6_dst_ops_template = {
        .family                 =       AF_INET6,
        .gc                     =       ip6_dst_gc,
        .gc_thresh              =       1024,
        .check                  =       ip6_dst_check,
        .default_advmss         =       ip6_default_advmss,
        .mtu                    =       ip6_mtu,
        .cow_metrics            =       ipv6_cow_metrics,
        .destroy                =       ip6_dst_destroy,
        .ifdown                 =       ip6_dst_ifdown,
        .negative_advice        =       ip6_negative_advice,
        .link_failure           =       ip6_link_failure,
        .update_pmtu            =       ip6_rt_update_pmtu,
        .redirect               =       rt6_do_redirect,
        .local_out              =       __ip6_local_out,
        .neigh_lookup           =       ip6_neigh_lookup,
        .confirm_neigh          =       ip6_confirm_neigh,
};
264
265 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
266 {
267         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
268
269         return mtu ? : dst->dev->mtu;
270 }
271
/*
 * .update_pmtu callback of ip6_dst_blackhole_ops.  Deliberately empty:
 * PMTU updates on a blackhole dst are ignored.
 */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                         struct sk_buff *skb, u32 mtu)
{
}
276
/*
 * .redirect callback of ip6_dst_blackhole_ops.  Deliberately empty:
 * redirects on a blackhole dst are ignored.
 */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
                                      struct sk_buff *skb)
{
}
281
/*
 * dst_ops table for blackhole dsts.  It shares destroy/check/advmss and
 * neighbour lookup with the regular table, but uses its own MTU helper,
 * no-op PMTU/redirect handlers and the generic metrics COW helper.
 */
static struct dst_ops ip6_dst_blackhole_ops = {
        .family                 =       AF_INET6,
        .destroy                =       ip6_dst_destroy,
        .check                  =       ip6_dst_check,
        .mtu                    =       ip6_blackhole_mtu,
        .default_advmss         =       ip6_default_advmss,
        .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
        .redirect               =       ip6_rt_blackhole_redirect,
        .cow_metrics            =       dst_cow_metrics_generic,
        .neigh_lookup           =       ip6_neigh_lookup,
};
293
/*
 * Read-only metrics array with RTAX_MAX slots.  The hop-limit slot is
 * named explicitly; every other slot is implicitly zero as well.
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
        [RTAX_HOPLIMIT - 1] = 0,
};
297
/*
 * Template for the "null" route entry: a reject route without a next
 * hop whose dst reports -ENETUNREACH and whose input/output handlers
 * are the ip6_pkt_discard*() functions.
 */
static const struct rt6_info ip6_null_entry_template = {
        .dst = {
                .__refcnt       = ATOMIC_INIT(1),
                .__use          = 1,
                .obsolete       = DST_OBSOLETE_FORCE_CHK,
                .error          = -ENETUNREACH,
                .input          = ip6_pkt_discard,
                .output         = ip6_pkt_discard_out,
        },
        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
        .rt6i_protocol  = RTPROT_KERNEL,
        .rt6i_metric    = ~(u32) 0,     /* largest possible metric value */
        .rt6i_ref       = ATOMIC_INIT(1),
};
312
313 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
314
/*
 * Template for the "prohibit" route entry (only built with
 * CONFIG_IPV6_MULTIPLE_TABLES): a reject route whose dst reports
 * -EACCES and whose input/output handlers are ip6_pkt_prohibit*().
 */
static const struct rt6_info ip6_prohibit_entry_template = {
        .dst = {
                .__refcnt       = ATOMIC_INIT(1),
                .__use          = 1,
                .obsolete       = DST_OBSOLETE_FORCE_CHK,
                .error          = -EACCES,
                .input          = ip6_pkt_prohibit,
                .output         = ip6_pkt_prohibit_out,
        },
        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
        .rt6i_protocol  = RTPROT_KERNEL,
        .rt6i_metric    = ~(u32) 0,     /* largest possible metric value */
        .rt6i_ref       = ATOMIC_INIT(1),
};
329
/*
 * Template for the "blackhole" route entry (only built with
 * CONFIG_IPV6_MULTIPLE_TABLES): a reject route whose dst reports
 * -EINVAL and hands packets to dst_discard()/dst_discard_out().
 */
static const struct rt6_info ip6_blk_hole_entry_template = {
        .dst = {
                .__refcnt       = ATOMIC_INIT(1),
                .__use          = 1,
                .obsolete       = DST_OBSOLETE_FORCE_CHK,
                .error          = -EINVAL,
                .input          = dst_discard,
                .output         = dst_discard_out,
        },
        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
        .rt6i_protocol  = RTPROT_KERNEL,
        .rt6i_metric    = ~(u32) 0,     /* largest possible metric value */
        .rt6i_ref       = ATOMIC_INIT(1),
};
344
345 #endif
346
347 static void rt6_info_init(struct rt6_info *rt)
348 {
349         struct dst_entry *dst = &rt->dst;
350
351         memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
352         INIT_LIST_HEAD(&rt->rt6i_siblings);
353         INIT_LIST_HEAD(&rt->rt6i_uncached);
354 }
355
356 /* allocate dst with ip6_dst_ops */
357 static struct rt6_info *__ip6_dst_alloc(struct net *net,
358                                         struct net_device *dev,
359                                         int flags)
360 {
361         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
362                                         1, DST_OBSOLETE_FORCE_CHK, flags);
363
364         if (rt) {
365                 rt6_info_init(rt);
366                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
367         }
368
369         return rt;
370 }
371
372 struct rt6_info *ip6_dst_alloc(struct net *net,
373                                struct net_device *dev,
374                                int flags)
375 {
376         struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
377
378         if (rt) {
379                 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
380                 if (!rt->rt6i_pcpu) {
381                         dst_release_immediate(&rt->dst);
382                         return NULL;
383                 }
384         }
385
386         return rt;
387 }
388 EXPORT_SYMBOL(ip6_dst_alloc);
389
390 static void ip6_dst_destroy(struct dst_entry *dst)
391 {
392         struct rt6_info *rt = (struct rt6_info *)dst;
393         struct rt6_exception_bucket *bucket;
394         struct rt6_info *from = rt->from;
395         struct inet6_dev *idev;
396
397         dst_destroy_metrics_generic(dst);
398         free_percpu(rt->rt6i_pcpu);
399         rt6_uncached_list_del(rt);
400
401         idev = rt->rt6i_idev;
402         if (idev) {
403                 rt->rt6i_idev = NULL;
404                 in6_dev_put(idev);
405         }
406         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
407         if (bucket) {
408                 rt->rt6i_exception_bucket = NULL;
409                 kfree(bucket);
410         }
411
412         rt->from = NULL;
413         dst_release(&from->dst);
414 }
415
416 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
417                            int how)
418 {
419         struct rt6_info *rt = (struct rt6_info *)dst;
420         struct inet6_dev *idev = rt->rt6i_idev;
421         struct net_device *loopback_dev =
422                 dev_net(dev)->loopback_dev;
423
424         if (idev && idev->dev != loopback_dev) {
425                 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
426                 if (loopback_idev) {
427                         rt->rt6i_idev = loopback_idev;
428                         in6_dev_put(idev);
429                 }
430         }
431 }
432
433 static bool __rt6_check_expired(const struct rt6_info *rt)
434 {
435         if (rt->rt6i_flags & RTF_EXPIRES)
436                 return time_after(jiffies, rt->dst.expires);
437         else
438                 return false;
439 }
440
441 static bool rt6_check_expired(const struct rt6_info *rt)
442 {
443         if (rt->rt6i_flags & RTF_EXPIRES) {
444                 if (time_after(jiffies, rt->dst.expires))
445                         return true;
446         } else if (rt->from) {
447                 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
448                         rt6_check_expired(rt->from);
449         }
450         return false;
451 }
452
453 static struct rt6_info *rt6_multipath_select(const struct net *net,
454                                              struct rt6_info *match,
455                                              struct flowi6 *fl6, int oif,
456                                              const struct sk_buff *skb,
457                                              int strict)
458 {
459         struct rt6_info *sibling, *next_sibling;
460
461         /* We might have already computed the hash for ICMPv6 errors. In such
462          * case it will always be non-zero. Otherwise now is the time to do it.
463          */
464         if (!fl6->mp_hash)
465                 fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
466
467         if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound))
468                 return match;
469
470         list_for_each_entry_safe(sibling, next_sibling, &match->rt6i_siblings,
471                                  rt6i_siblings) {
472                 if (fl6->mp_hash > atomic_read(&sibling->rt6i_nh_upper_bound))
473                         continue;
474                 if (rt6_score_route(sibling, oif, strict) < 0)
475                         break;
476                 match = sibling;
477                 break;
478         }
479
480         return match;
481 }
482
/*
 *      Route lookup. rcu_read_lock() should be held.
 */

/*
 * rt6_device_match - pick, from @rt and the routes chained behind it via
 * rt6_next, the one that fits the requested output interface or source
 * address.
 *
 * @net:   namespace, used for address checks and for the null entry
 * @rt:    first candidate route
 * @saddr: source address to match when @oif is 0
 * @oif:   requested output interface index, 0 for "any"
 * @flags: RT6_LOOKUP_F_* flags; RT6_LOOKUP_F_IFACE makes the interface
 *         match mandatory
 *
 * Returns the matching route, a loopback route remembered as fallback,
 * @rt itself, or net->ipv6.ip6_null_entry - see the individual exits.
 */
static inline struct rt6_info *rt6_device_match(struct net *net,
                                                    struct rt6_info *rt,
                                                    const struct in6_addr *saddr,
                                                    int oif,
                                                    int flags)
{
        struct rt6_info *local = NULL;
        struct rt6_info *sprt;

        /* no constraint at all: the first route wins unless its nexthop is dead */
        if (!oif && ipv6_addr_any(saddr) && !(rt->rt6i_nh_flags & RTNH_F_DEAD))
                return rt;

        for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) {
                struct net_device *dev = sprt->dst.dev;

                /* dead nexthops are never candidates */
                if (sprt->rt6i_nh_flags & RTNH_F_DEAD)
                        continue;

                if (oif) {
                        /* exact interface match */
                        if (dev->ifindex == oif)
                                return sprt;
                        if (dev->flags & IFF_LOOPBACK) {
                                /*
                                 * Loopback route whose inet6_dev is missing or
                                 * does not belong to oif: skip it in strict
                                 * mode, or when a better loopback candidate
                                 * (one whose idev matches oif) is already held.
                                 */
                                if (!sprt->rt6i_idev ||
                                    sprt->rt6i_idev->dev->ifindex != oif) {
                                        if (flags & RT6_LOOKUP_F_IFACE)
                                                continue;
                                        if (local &&
                                            local->rt6i_idev->dev->ifindex == oif)
                                                continue;
                                }
                                /* remember as fallback, keep looking for an exact match */
                                local = sprt;
                        }
                } else {
                        /* no oif: match on the source address being configured on dev */
                        if (ipv6_chk_addr(net, saddr, dev,
                                          flags & RT6_LOOKUP_F_IFACE))
                                return sprt;
                }
        }

        if (oif) {
                if (local)
                        return local;

                /* strict interface lookup and nothing matched */
                if (flags & RT6_LOOKUP_F_IFACE)
                        return net->ipv6.ip6_null_entry;
        }

        /* fall back to the head of the chain, unless its nexthop is dead */
        return rt->rt6i_nh_flags & RTNH_F_DEAD ? net->ipv6.ip6_null_entry : rt;
}
536
537 #ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Deferred router probe request: queued by rt6_probe(), consumed and
 * freed by rt6_probe_deferred().
 */
struct __rt6_probe_work {
        struct work_struct work;
        struct in6_addr target;         /* gateway address to solicit */
        struct net_device *dev;         /* held with dev_hold() until the work ran */
};
543
544 static void rt6_probe_deferred(struct work_struct *w)
545 {
546         struct in6_addr mcaddr;
547         struct __rt6_probe_work *work =
548                 container_of(w, struct __rt6_probe_work, work);
549
550         addrconf_addr_solict_mult(&work->target, &mcaddr);
551         ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
552         dev_put(work->dev);
553         kfree(work);
554 }
555
/*
 * rt6_probe - queue a reachability probe for the gateway of @rt
 *
 * Only routes flagged RTF_GATEWAY are probed.  If a neighbour entry for
 * the gateway exists and is NUD_VALID nothing is done; if it exists but
 * is not valid, a probe is queued only once rtr_probe_interval has
 * passed since neigh->updated.  The solicitation itself is sent later
 * from rt6_probe_deferred() via the system workqueue.
 *
 * NOTE(review): when no neighbour entry exists at all, a work item is
 * allocated on every call - no rate limit is applied on that path in
 * the code visible here.
 */
static void rt6_probe(struct rt6_info *rt)
{
        struct __rt6_probe_work *work;
        struct neighbour *neigh;
        /*
         * Okay, this does not seem to be appropriate
         * for now, however, we need to check if it
         * is really so; aka Router Reachability Probing.
         *
         * Router Reachability Probe MUST be rate-limited
         * to no more than one per minute.
         */
        if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
                return;
        rcu_read_lock_bh();
        neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
        if (neigh) {
                /* cheap unlocked check first */
                if (neigh->nud_state & NUD_VALID)
                        goto out;

                work = NULL;
                write_lock(&neigh->lock);
                /* re-check the state now that the neighbour lock is held */
                if (!(neigh->nud_state & NUD_VALID) &&
                    time_after(jiffies,
                               neigh->updated +
                               rt->rt6i_idev->cnf.rtr_probe_interval)) {
                        work = kmalloc(sizeof(*work), GFP_ATOMIC);
                        if (work)
                                __neigh_set_probe_once(neigh);
                }
                write_unlock(&neigh->lock);
        } else {
                work = kmalloc(sizeof(*work), GFP_ATOMIC);
        }

        /* work is NULL when no probe is due or the allocation failed */
        if (work) {
                INIT_WORK(&work->work, rt6_probe_deferred);
                work->target = rt->rt6i_gateway;
                /* reference is dropped by rt6_probe_deferred() */
                dev_hold(rt->dst.dev);
                work->dev = rt->dst.dev;
                schedule_work(&work->work);
        }

out:
        rcu_read_unlock_bh();
}
602 #else
/* Router probing is compiled out without CONFIG_IPV6_ROUTER_PREF. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
606 #endif
607
608 /*
609  * Default Router Selection (RFC 2461 6.3.6)
610  */
611 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
612 {
613         struct net_device *dev = rt->dst.dev;
614         if (!oif || dev->ifindex == oif)
615                 return 2;
616         if ((dev->flags & IFF_LOOPBACK) &&
617             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
618                 return 1;
619         return 0;
620 }
621
622 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
623 {
624         struct neighbour *neigh;
625         enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
626
627         if (rt->rt6i_flags & RTF_NONEXTHOP ||
628             !(rt->rt6i_flags & RTF_GATEWAY))
629                 return RT6_NUD_SUCCEED;
630
631         rcu_read_lock_bh();
632         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
633         if (neigh) {
634                 read_lock(&neigh->lock);
635                 if (neigh->nud_state & NUD_VALID)
636                         ret = RT6_NUD_SUCCEED;
637 #ifdef CONFIG_IPV6_ROUTER_PREF
638                 else if (!(neigh->nud_state & NUD_FAILED))
639                         ret = RT6_NUD_SUCCEED;
640                 else
641                         ret = RT6_NUD_FAIL_PROBE;
642 #endif
643                 read_unlock(&neigh->lock);
644         } else {
645                 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
646                       RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
647         }
648         rcu_read_unlock_bh();
649
650         return ret;
651 }
652
653 static int rt6_score_route(struct rt6_info *rt, int oif,
654                            int strict)
655 {
656         int m;
657
658         m = rt6_check_dev(rt, oif);
659         if (!m && (strict & RT6_LOOKUP_F_IFACE))
660                 return RT6_NUD_FAIL_HARD;
661 #ifdef CONFIG_IPV6_ROUTER_PREF
662         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
663 #endif
664         if (strict & RT6_LOOKUP_F_REACHABLE) {
665                 int n = rt6_check_neigh(rt);
666                 if (n < 0)
667                         return n;
668         }
669         return m;
670 }
671
672 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
673                                    int *mpri, struct rt6_info *match,
674                                    bool *do_rr)
675 {
676         int m;
677         bool match_do_rr = false;
678         struct inet6_dev *idev = rt->rt6i_idev;
679
680         if (rt->rt6i_nh_flags & RTNH_F_DEAD)
681                 goto out;
682
683         if (idev->cnf.ignore_routes_with_linkdown &&
684             rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
685             !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
686                 goto out;
687
688         if (rt6_check_expired(rt))
689                 goto out;
690
691         m = rt6_score_route(rt, oif, strict);
692         if (m == RT6_NUD_FAIL_DO_RR) {
693                 match_do_rr = true;
694                 m = 0; /* lowest valid score */
695         } else if (m == RT6_NUD_FAIL_HARD) {
696                 goto out;
697         }
698
699         if (strict & RT6_LOOKUP_F_REACHABLE)
700                 rt6_probe(rt);
701
702         /* note that m can be RT6_NUD_FAIL_PROBE at this point */
703         if (m > *mpri) {
704                 *do_rr = match_do_rr;
705                 *mpri = m;
706                 match = rt;
707         }
708 out:
709         return match;
710 }
711
712 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
713                                      struct rt6_info *leaf,
714                                      struct rt6_info *rr_head,
715                                      u32 metric, int oif, int strict,
716                                      bool *do_rr)
717 {
718         struct rt6_info *rt, *match, *cont;
719         int mpri = -1;
720
721         match = NULL;
722         cont = NULL;
723         for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) {
724                 if (rt->rt6i_metric != metric) {
725                         cont = rt;
726                         break;
727                 }
728
729                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
730         }
731
732         for (rt = leaf; rt && rt != rr_head;
733              rt = rcu_dereference(rt->rt6_next)) {
734                 if (rt->rt6i_metric != metric) {
735                         cont = rt;
736                         break;
737                 }
738
739                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
740         }
741
742         if (match || !cont)
743                 return match;
744
745         for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next))
746                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
747
748         return match;
749 }
750
/*
 * rt6_select - choose the route to use from the routes hanging off @fn
 *
 * @net:    namespace (supplies the null entry)
 * @fn:     fib6 node whose leaf list is searched
 * @oif:    requested output interface index, 0 for "any"
 * @strict: RT6_LOOKUP_F_* flags handed on to the scoring helpers
 *
 * The search starts at the node's round-robin pointer (fn->rr_ptr), or
 * at the leaf when that pointer is unset.  When the search asks for
 * round-robin, fn->rr_ptr is advanced to the next same-metric route
 * under the table lock.  Returns the selected route, or
 * net->ipv6.ip6_null_entry when there is none.
 */
static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
                                   int oif, int strict)
{
        struct rt6_info *leaf = rcu_dereference(fn->leaf);
        struct rt6_info *match, *rt0;
        bool do_rr = false;
        int key_plen;

        if (!leaf || leaf == net->ipv6.ip6_null_entry)
                return net->ipv6.ip6_null_entry;

        rt0 = rcu_dereference(fn->rr_ptr);
        if (!rt0)
                rt0 = leaf;

        /* Double check to make sure fn is not an intermediate node
         * and fn->leaf does not point to its child's leaf
         * (This might happen if all routes under fn are deleted from
         * the tree and fib6_repair_tree() is called on the node.)
         */
        key_plen = rt0->rt6i_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
        /* a non-zero source prefix length takes precedence as the key length */
        if (rt0->rt6i_src.plen)
                key_plen = rt0->rt6i_src.plen;
#endif
        if (fn->fn_bit != key_plen)
                return net->ipv6.ip6_null_entry;

        match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
                             &do_rr);

        if (do_rr) {
                struct rt6_info *next = rcu_dereference(rt0->rt6_next);

                /* no entries matched; do round-robin */
                /* wrap to the leaf at the end of the list or of the metric run */
                if (!next || next->rt6i_metric != rt0->rt6i_metric)
                        next = leaf;

                if (next != rt0) {
                        spin_lock_bh(&leaf->rt6i_table->tb6_lock);
                        /* make sure next is not being deleted from the tree */
                        if (next->rt6i_node)
                                rcu_assign_pointer(fn->rr_ptr, next);
                        spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
                }
        }

        return match ? match : net->ipv6.ip6_null_entry;
}
800
801 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
802 {
803         return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
804 }
805
806 #ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * rt6_route_rcv - process a received route information option
 *
 * @dev:    device the option arrived on
 * @opt:    start of the option, interpreted as struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: address of the router that sent it, used as gateway
 *
 * Depending on the advertised lifetime, the matching route is deleted,
 * created, or has its preference and expiry refreshed.  A prefix length
 * of 0 is handled as a default-router entry.
 *
 * Returns 0 on success (including "nothing to do") and -EINVAL when the
 * option is too short, has inconsistent length/prefix_len fields, or
 * carries the invalid preference value.
 *
 * NOTE(review): @len is an int compared against a size_t; a negative
 * value would be converted to a large unsigned number and pass the
 * first check - confirm callers never pass a negative length.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
                  const struct in6_addr *gwaddr)
{
        struct net *net = dev_net(dev);
        struct route_info *rinfo = (struct route_info *) opt;
        struct in6_addr prefix_buf, *prefix;
        unsigned int pref;
        unsigned long lifetime;
        struct rt6_info *rt;

        if (len < sizeof(struct route_info)) {
                return -EINVAL;
        }

        /* Sanity check for prefix_len and length */
        /* longer prefixes require a larger option length field */
        if (rinfo->length > 3) {
                return -EINVAL;
        } else if (rinfo->prefix_len > 128) {
                return -EINVAL;
        } else if (rinfo->prefix_len > 64) {
                if (rinfo->length < 2) {
                        return -EINVAL;
                }
        } else if (rinfo->prefix_len > 0) {
                if (rinfo->length < 1) {
                        return -EINVAL;
                }
        }

        pref = rinfo->route_pref;
        if (pref == ICMPV6_ROUTER_PREF_INVALID)
                return -EINVAL;

        lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

        /*
         * With length 3 the prefix field is used in place; otherwise only
         * prefix_len bits are copied into a local buffer.
         */
        if (rinfo->length == 3)
                prefix = (struct in6_addr *)rinfo->prefix;
        else {
                /* this function is safe */
                ipv6_addr_prefix(&prefix_buf,
                                 (struct in6_addr *)rinfo->prefix,
                                 rinfo->prefix_len);
                prefix = &prefix_buf;
        }

        /* look up an existing entry for this prefix/gateway/device */
        if (rinfo->prefix_len == 0)
                rt = rt6_get_dflt_router(gwaddr, dev);
        else
                rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
                                        gwaddr, dev);

        /* zero lifetime withdraws an existing route */
        if (rt && !lifetime) {
                ip6_del_rt(rt);
                rt = NULL;
        }

        if (!rt && lifetime)
                rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
                                        dev, pref);
        else if (rt)
                /* existing route: mark as route-info and replace the preference bits */
                rt->rt6i_flags = RTF_ROUTEINFO |
                                 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

        if (rt) {
                /* a non-finite lifetime clears the expiry, a finite one (re)arms it */
                if (!addrconf_finite_timeout(lifetime))
                        rt6_clean_expires(rt);
                else
                        rt6_set_expires(rt, jiffies + HZ * lifetime);

                ip6_rt_put(rt);
        }
        return 0;
}
880 #endif
881
882 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
883                                         struct in6_addr *saddr)
884 {
885         struct fib6_node *pn, *sn;
886         while (1) {
887                 if (fn->fn_flags & RTN_TL_ROOT)
888                         return NULL;
889                 pn = rcu_dereference(fn->parent);
890                 sn = FIB6_SUBTREE(pn);
891                 if (sn && sn != fn)
892                         fn = fib6_lookup(sn, NULL, saddr);
893                 else
894                         fn = pn;
895                 if (fn->fn_flags & RTN_RTINFO)
896                         return fn;
897         }
898 }
899
900 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
901                           bool null_fallback)
902 {
903         struct rt6_info *rt = *prt;
904
905         if (dst_hold_safe(&rt->dst))
906                 return true;
907         if (null_fallback) {
908                 rt = net->ipv6.ip6_null_entry;
909                 dst_hold(&rt->dst);
910         } else {
911                 rt = NULL;
912         }
913         *prt = rt;
914         return false;
915 }
916
917 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
918                                              struct fib6_table *table,
919                                              struct flowi6 *fl6,
920                                              const struct sk_buff *skb,
921                                              int flags)
922 {
923         struct rt6_info *rt, *rt_cache;
924         struct fib6_node *fn;
925
926         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
927                 flags &= ~RT6_LOOKUP_F_IFACE;
928
929         rcu_read_lock();
930         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
931 restart:
932         rt = rcu_dereference(fn->leaf);
933         if (!rt) {
934                 rt = net->ipv6.ip6_null_entry;
935         } else {
936                 rt = rt6_device_match(net, rt, &fl6->saddr,
937                                       fl6->flowi6_oif, flags);
938                 if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
939                         rt = rt6_multipath_select(net, rt, fl6, fl6->flowi6_oif,
940                                                   skb, flags);
941         }
942         if (rt == net->ipv6.ip6_null_entry) {
943                 fn = fib6_backtrack(fn, &fl6->saddr);
944                 if (fn)
945                         goto restart;
946         }
947         /* Search through exception table */
948         rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
949         if (rt_cache)
950                 rt = rt_cache;
951
952         if (ip6_hold_safe(net, &rt, true))
953                 dst_use_noref(&rt->dst, jiffies);
954
955         rcu_read_unlock();
956
957         trace_fib6_table_lookup(net, rt, table, fl6);
958
959         return rt;
960
961 }
962
963 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
964                                    const struct sk_buff *skb, int flags)
965 {
966         return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
967 }
968 EXPORT_SYMBOL_GPL(ip6_route_lookup);
969
970 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
971                             const struct in6_addr *saddr, int oif,
972                             const struct sk_buff *skb, int strict)
973 {
974         struct flowi6 fl6 = {
975                 .flowi6_oif = oif,
976                 .daddr = *daddr,
977         };
978         struct dst_entry *dst;
979         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
980
981         if (saddr) {
982                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
983                 flags |= RT6_LOOKUP_F_HAS_SADDR;
984         }
985
986         dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
987         if (dst->error == 0)
988                 return (struct rt6_info *) dst;
989
990         dst_release(dst);
991
992         return NULL;
993 }
994 EXPORT_SYMBOL(rt6_lookup);
995
996 /* ip6_ins_rt is called with FREE table->tb6_lock.
997  * It takes new route entry, the addition fails by any reason the
998  * route is released.
999  * Caller must hold dst before calling it.
1000  */
1001
1002 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
1003                         struct mx6_config *mxc,
1004                         struct netlink_ext_ack *extack)
1005 {
1006         int err;
1007         struct fib6_table *table;
1008
1009         table = rt->rt6i_table;
1010         spin_lock_bh(&table->tb6_lock);
1011         err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
1012         spin_unlock_bh(&table->tb6_lock);
1013
1014         return err;
1015 }
1016
1017 int ip6_ins_rt(struct rt6_info *rt)
1018 {
1019         struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
1020         struct mx6_config mxc = { .mx = NULL, };
1021
1022         /* Hold dst to account for the reference from the fib6 tree */
1023         dst_hold(&rt->dst);
1024         return __ip6_ins_rt(rt, &info, &mxc, NULL);
1025 }
1026
1027 /* called with rcu_lock held */
1028 static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
1029 {
1030         struct net_device *dev = rt->dst.dev;
1031
1032         if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) {
1033                 /* for copies of local routes, dst->dev needs to be the
1034                  * device if it is a master device, the master device if
1035                  * device is enslaved, and the loopback as the default
1036                  */
1037                 if (netif_is_l3_slave(dev) &&
1038                     !rt6_need_strict(&rt->rt6i_dst.addr))
1039                         dev = l3mdev_master_dev_rcu(dev);
1040                 else if (!netif_is_l3_master(dev))
1041                         dev = dev_net(dev)->loopback_dev;
1042                 /* last case is netif_is_l3_master(dev) is true in which
1043                  * case we want dev returned to be dev
1044                  */
1045         }
1046
1047         return dev;
1048 }
1049
1050 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
1051                                            const struct in6_addr *daddr,
1052                                            const struct in6_addr *saddr)
1053 {
1054         struct net_device *dev;
1055         struct rt6_info *rt;
1056
1057         /*
1058          *      Clone the route.
1059          */
1060
1061         if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
1062                 ort = ort->from;
1063
1064         rcu_read_lock();
1065         dev = ip6_rt_get_dev_rcu(ort);
1066         rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
1067         rcu_read_unlock();
1068         if (!rt)
1069                 return NULL;
1070
1071         ip6_rt_copy_init(rt, ort);
1072         rt->rt6i_flags |= RTF_CACHE;
1073         rt->rt6i_metric = 0;
1074         rt->dst.flags |= DST_HOST;
1075         rt->rt6i_dst.addr = *daddr;
1076         rt->rt6i_dst.plen = 128;
1077
1078         if (!rt6_is_gw_or_nonexthop(ort)) {
1079                 if (ort->rt6i_dst.plen != 128 &&
1080                     ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
1081                         rt->rt6i_flags |= RTF_ANYCAST;
1082 #ifdef CONFIG_IPV6_SUBTREES
1083                 if (rt->rt6i_src.plen && saddr) {
1084                         rt->rt6i_src.addr = *saddr;
1085                         rt->rt6i_src.plen = 128;
1086                 }
1087 #endif
1088         }
1089
1090         return rt;
1091 }
1092
1093 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
1094 {
1095         struct net_device *dev;
1096         struct rt6_info *pcpu_rt;
1097
1098         rcu_read_lock();
1099         dev = ip6_rt_get_dev_rcu(rt);
1100         pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
1101         rcu_read_unlock();
1102         if (!pcpu_rt)
1103                 return NULL;
1104         ip6_rt_copy_init(pcpu_rt, rt);
1105         pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1106         pcpu_rt->rt6i_flags |= RTF_PCPU;
1107         return pcpu_rt;
1108 }
1109
1110 /* It should be called with rcu_read_lock() acquired */
1111 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1112 {
1113         struct rt6_info *pcpu_rt, **p;
1114
1115         p = this_cpu_ptr(rt->rt6i_pcpu);
1116         pcpu_rt = *p;
1117
1118         if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
1119                 rt6_dst_from_metrics_check(pcpu_rt);
1120
1121         return pcpu_rt;
1122 }
1123
1124 static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
1125 {
1126         struct rt6_info *pcpu_rt, *prev, **p;
1127
1128         pcpu_rt = ip6_rt_pcpu_alloc(rt);
1129         if (!pcpu_rt) {
1130                 struct net *net = dev_net(rt->dst.dev);
1131
1132                 dst_hold(&net->ipv6.ip6_null_entry->dst);
1133                 return net->ipv6.ip6_null_entry;
1134         }
1135
1136         dst_hold(&pcpu_rt->dst);
1137         p = this_cpu_ptr(rt->rt6i_pcpu);
1138         prev = cmpxchg(p, NULL, pcpu_rt);
1139         BUG_ON(prev);
1140
1141         rt6_dst_from_metrics_check(pcpu_rt);
1142         return pcpu_rt;
1143 }
1144
/* exception hash table implementation
 *
 * rt6_exception_lock is the spinlock taken by the functions below that
 * insert into, remove from, or flush a route's exception buckets.  Lookup
 * helpers with an _rcu suffix walk the buckets under rcu_read_lock()
 * instead of taking this lock.
 */
static DEFINE_SPINLOCK(rt6_exception_lock);
1148
1149 /* Remove rt6_ex from hash table and free the memory
1150  * Caller must hold rt6_exception_lock
1151  */
1152 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1153                                  struct rt6_exception *rt6_ex)
1154 {
1155         struct net *net;
1156
1157         if (!bucket || !rt6_ex)
1158                 return;
1159
1160         net = dev_net(rt6_ex->rt6i->dst.dev);
1161         rt6_ex->rt6i->rt6i_node = NULL;
1162         hlist_del_rcu(&rt6_ex->hlist);
1163         rt6_release(rt6_ex->rt6i);
1164         kfree_rcu(rt6_ex, rcu);
1165         WARN_ON_ONCE(!bucket->depth);
1166         bucket->depth--;
1167         net->ipv6.rt6_stats->fib_rt_cache--;
1168 }
1169
1170 /* Remove oldest rt6_ex in bucket and free the memory
1171  * Caller must hold rt6_exception_lock
1172  */
1173 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1174 {
1175         struct rt6_exception *rt6_ex, *oldest = NULL;
1176
1177         if (!bucket)
1178                 return;
1179
1180         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1181                 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1182                         oldest = rt6_ex;
1183         }
1184         rt6_remove_exception(bucket, oldest);
1185 }
1186
1187 static u32 rt6_exception_hash(const struct in6_addr *dst,
1188                               const struct in6_addr *src)
1189 {
1190         static u32 seed __read_mostly;
1191         u32 val;
1192
1193         net_get_random_once(&seed, sizeof(seed));
1194         val = jhash(dst, sizeof(*dst), seed);
1195
1196 #ifdef CONFIG_IPV6_SUBTREES
1197         if (src)
1198                 val = jhash(src, sizeof(*src), val);
1199 #endif
1200         return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1201 }
1202
1203 /* Helper function to find the cached rt in the hash table
1204  * and update bucket pointer to point to the bucket for this
1205  * (daddr, saddr) pair
1206  * Caller must hold rt6_exception_lock
1207  */
1208 static struct rt6_exception *
1209 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1210                               const struct in6_addr *daddr,
1211                               const struct in6_addr *saddr)
1212 {
1213         struct rt6_exception *rt6_ex;
1214         u32 hval;
1215
1216         if (!(*bucket) || !daddr)
1217                 return NULL;
1218
1219         hval = rt6_exception_hash(daddr, saddr);
1220         *bucket += hval;
1221
1222         hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1223                 struct rt6_info *rt6 = rt6_ex->rt6i;
1224                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1225
1226 #ifdef CONFIG_IPV6_SUBTREES
1227                 if (matched && saddr)
1228                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1229 #endif
1230                 if (matched)
1231                         return rt6_ex;
1232         }
1233         return NULL;
1234 }
1235
1236 /* Helper function to find the cached rt in the hash table
1237  * and update bucket pointer to point to the bucket for this
1238  * (daddr, saddr) pair
1239  * Caller must hold rcu_read_lock()
1240  */
1241 static struct rt6_exception *
1242 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1243                          const struct in6_addr *daddr,
1244                          const struct in6_addr *saddr)
1245 {
1246         struct rt6_exception *rt6_ex;
1247         u32 hval;
1248
1249         WARN_ON_ONCE(!rcu_read_lock_held());
1250
1251         if (!(*bucket) || !daddr)
1252                 return NULL;
1253
1254         hval = rt6_exception_hash(daddr, saddr);
1255         *bucket += hval;
1256
1257         hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1258                 struct rt6_info *rt6 = rt6_ex->rt6i;
1259                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1260
1261 #ifdef CONFIG_IPV6_SUBTREES
1262                 if (matched && saddr)
1263                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1264 #endif
1265                 if (matched)
1266                         return rt6_ex;
1267         }
1268         return NULL;
1269 }
1270
1271 static int rt6_insert_exception(struct rt6_info *nrt,
1272                                 struct rt6_info *ort)
1273 {
1274         struct net *net = dev_net(ort->dst.dev);
1275         struct rt6_exception_bucket *bucket;
1276         struct in6_addr *src_key = NULL;
1277         struct rt6_exception *rt6_ex;
1278         int err = 0;
1279
1280         /* ort can't be a cache or pcpu route */
1281         if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
1282                 ort = ort->from;
1283         WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));
1284
1285         spin_lock_bh(&rt6_exception_lock);
1286
1287         if (ort->exception_bucket_flushed) {
1288                 err = -EINVAL;
1289                 goto out;
1290         }
1291
1292         bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1293                                         lockdep_is_held(&rt6_exception_lock));
1294         if (!bucket) {
1295                 bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1296                                  GFP_ATOMIC);
1297                 if (!bucket) {
1298                         err = -ENOMEM;
1299                         goto out;
1300                 }
1301                 rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1302         }
1303
1304 #ifdef CONFIG_IPV6_SUBTREES
1305         /* rt6i_src.plen != 0 indicates ort is in subtree
1306          * and exception table is indexed by a hash of
1307          * both rt6i_dst and rt6i_src.
1308          * Otherwise, the exception table is indexed by
1309          * a hash of only rt6i_dst.
1310          */
1311         if (ort->rt6i_src.plen)
1312                 src_key = &nrt->rt6i_src.addr;
1313 #endif
1314
1315         /* Update rt6i_prefsrc as it could be changed
1316          * in rt6_remove_prefsrc()
1317          */
1318         nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
1319         /* rt6_mtu_change() might lower mtu on ort.
1320          * Only insert this exception route if its mtu
1321          * is less than ort's mtu value.
1322          */
1323         if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
1324                 err = -EINVAL;
1325                 goto out;
1326         }
1327
1328         rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1329                                                src_key);
1330         if (rt6_ex)
1331                 rt6_remove_exception(bucket, rt6_ex);
1332
1333         rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1334         if (!rt6_ex) {
1335                 err = -ENOMEM;
1336                 goto out;
1337         }
1338         rt6_ex->rt6i = nrt;
1339         rt6_ex->stamp = jiffies;
1340         atomic_inc(&nrt->rt6i_ref);
1341         nrt->rt6i_node = ort->rt6i_node;
1342         hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1343         bucket->depth++;
1344         net->ipv6.rt6_stats->fib_rt_cache++;
1345
1346         if (bucket->depth > FIB6_MAX_DEPTH)
1347                 rt6_exception_remove_oldest(bucket);
1348
1349 out:
1350         spin_unlock_bh(&rt6_exception_lock);
1351
1352         /* Update fn->fn_sernum to invalidate all cached dst */
1353         if (!err) {
1354                 spin_lock_bh(&ort->rt6i_table->tb6_lock);
1355                 fib6_update_sernum(ort);
1356                 spin_unlock_bh(&ort->rt6i_table->tb6_lock);
1357                 fib6_force_start_gc(net);
1358         }
1359
1360         return err;
1361 }
1362
1363 void rt6_flush_exceptions(struct rt6_info *rt)
1364 {
1365         struct rt6_exception_bucket *bucket;
1366         struct rt6_exception *rt6_ex;
1367         struct hlist_node *tmp;
1368         int i;
1369
1370         spin_lock_bh(&rt6_exception_lock);
1371         /* Prevent rt6_insert_exception() to recreate the bucket list */
1372         rt->exception_bucket_flushed = 1;
1373
1374         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1375                                     lockdep_is_held(&rt6_exception_lock));
1376         if (!bucket)
1377                 goto out;
1378
1379         for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1380                 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1381                         rt6_remove_exception(bucket, rt6_ex);
1382                 WARN_ON_ONCE(bucket->depth);
1383                 bucket++;
1384         }
1385
1386 out:
1387         spin_unlock_bh(&rt6_exception_lock);
1388 }
1389
1390 /* Find cached rt in the hash table inside passed in rt
1391  * Caller has to hold rcu_read_lock()
1392  */
1393 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
1394                                            struct in6_addr *daddr,
1395                                            struct in6_addr *saddr)
1396 {
1397         struct rt6_exception_bucket *bucket;
1398         struct in6_addr *src_key = NULL;
1399         struct rt6_exception *rt6_ex;
1400         struct rt6_info *res = NULL;
1401
1402         bucket = rcu_dereference(rt->rt6i_exception_bucket);
1403
1404 #ifdef CONFIG_IPV6_SUBTREES
1405         /* rt6i_src.plen != 0 indicates rt is in subtree
1406          * and exception table is indexed by a hash of
1407          * both rt6i_dst and rt6i_src.
1408          * Otherwise, the exception table is indexed by
1409          * a hash of only rt6i_dst.
1410          */
1411         if (rt->rt6i_src.plen)
1412                 src_key = saddr;
1413 #endif
1414         rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1415
1416         if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1417                 res = rt6_ex->rt6i;
1418
1419         return res;
1420 }
1421
1422 /* Remove the passed in cached rt from the hash table that contains it */
1423 int rt6_remove_exception_rt(struct rt6_info *rt)
1424 {
1425         struct rt6_exception_bucket *bucket;
1426         struct rt6_info *from = rt->from;
1427         struct in6_addr *src_key = NULL;
1428         struct rt6_exception *rt6_ex;
1429         int err;
1430
1431         if (!from ||
1432             !(rt->rt6i_flags & RTF_CACHE))
1433                 return -EINVAL;
1434
1435         if (!rcu_access_pointer(from->rt6i_exception_bucket))
1436                 return -ENOENT;
1437
1438         spin_lock_bh(&rt6_exception_lock);
1439         bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1440                                     lockdep_is_held(&rt6_exception_lock));
1441 #ifdef CONFIG_IPV6_SUBTREES
1442         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1443          * and exception table is indexed by a hash of
1444          * both rt6i_dst and rt6i_src.
1445          * Otherwise, the exception table is indexed by
1446          * a hash of only rt6i_dst.
1447          */
1448         if (from->rt6i_src.plen)
1449                 src_key = &rt->rt6i_src.addr;
1450 #endif
1451         rt6_ex = __rt6_find_exception_spinlock(&bucket,
1452                                                &rt->rt6i_dst.addr,
1453                                                src_key);
1454         if (rt6_ex) {
1455                 rt6_remove_exception(bucket, rt6_ex);
1456                 err = 0;
1457         } else {
1458                 err = -ENOENT;
1459         }
1460
1461         spin_unlock_bh(&rt6_exception_lock);
1462         return err;
1463 }
1464
1465 /* Find rt6_ex which contains the passed in rt cache and
1466  * refresh its stamp
1467  */
1468 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1469 {
1470         struct rt6_exception_bucket *bucket;
1471         struct rt6_info *from = rt->from;
1472         struct in6_addr *src_key = NULL;
1473         struct rt6_exception *rt6_ex;
1474
1475         if (!from ||
1476             !(rt->rt6i_flags & RTF_CACHE))
1477                 return;
1478
1479         rcu_read_lock();
1480         bucket = rcu_dereference(from->rt6i_exception_bucket);
1481
1482 #ifdef CONFIG_IPV6_SUBTREES
1483         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1484          * and exception table is indexed by a hash of
1485          * both rt6i_dst and rt6i_src.
1486          * Otherwise, the exception table is indexed by
1487          * a hash of only rt6i_dst.
1488          */
1489         if (from->rt6i_src.plen)
1490                 src_key = &rt->rt6i_src.addr;
1491 #endif
1492         rt6_ex = __rt6_find_exception_rcu(&bucket,
1493                                           &rt->rt6i_dst.addr,
1494                                           src_key);
1495         if (rt6_ex)
1496                 rt6_ex->stamp = jiffies;
1497
1498         rcu_read_unlock();
1499 }
1500
1501 static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
1502 {
1503         struct rt6_exception_bucket *bucket;
1504         struct rt6_exception *rt6_ex;
1505         int i;
1506
1507         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1508                                         lockdep_is_held(&rt6_exception_lock));
1509
1510         if (bucket) {
1511                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1512                         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1513                                 rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
1514                         }
1515                         bucket++;
1516                 }
1517         }
1518 }
1519
1520 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1521                                          struct rt6_info *rt, int mtu)
1522 {
1523         /* If the new MTU is lower than the route PMTU, this new MTU will be the
1524          * lowest MTU in the path: always allow updating the route PMTU to
1525          * reflect PMTU decreases.
1526          *
1527          * If the new MTU is higher, and the route PMTU is equal to the local
1528          * MTU, this means the old MTU is the lowest in the path, so allow
1529          * updating it: if other nodes now have lower MTUs, PMTU discovery will
1530          * handle this.
1531          */
1532
1533         if (dst_mtu(&rt->dst) >= mtu)
1534                 return true;
1535
1536         if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1537                 return true;
1538
1539         return false;
1540 }
1541
1542 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
1543                                        struct rt6_info *rt, int mtu)
1544 {
1545         struct rt6_exception_bucket *bucket;
1546         struct rt6_exception *rt6_ex;
1547         int i;
1548
1549         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1550                                         lockdep_is_held(&rt6_exception_lock));
1551
1552         if (!bucket)
1553                 return;
1554
1555         for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1556                 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1557                         struct rt6_info *entry = rt6_ex->rt6i;
1558
1559                         /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
1560                          * route), the metrics of its rt->dst.from have already
1561                          * been updated.
1562                          */
1563                         if (entry->rt6i_pmtu &&
1564                             rt6_mtu_change_route_allowed(idev, entry, mtu))
1565                                 entry->rt6i_pmtu = mtu;
1566                 }
1567                 bucket++;
1568         }
1569 }
1570
1571 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
1572
1573 static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
1574                                         struct in6_addr *gateway)
1575 {
1576         struct rt6_exception_bucket *bucket;
1577         struct rt6_exception *rt6_ex;
1578         struct hlist_node *tmp;
1579         int i;
1580
1581         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1582                 return;
1583
1584         spin_lock_bh(&rt6_exception_lock);
1585         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1586                                      lockdep_is_held(&rt6_exception_lock));
1587
1588         if (bucket) {
1589                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1590                         hlist_for_each_entry_safe(rt6_ex, tmp,
1591                                                   &bucket->chain, hlist) {
1592                                 struct rt6_info *entry = rt6_ex->rt6i;
1593
1594                                 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1595                                     RTF_CACHE_GATEWAY &&
1596                                     ipv6_addr_equal(gateway,
1597                                                     &entry->rt6i_gateway)) {
1598                                         rt6_remove_exception(bucket, rt6_ex);
1599                                 }
1600                         }
1601                         bucket++;
1602                 }
1603         }
1604
1605         spin_unlock_bh(&rt6_exception_lock);
1606 }
1607
1608 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1609                                       struct rt6_exception *rt6_ex,
1610                                       struct fib6_gc_args *gc_args,
1611                                       unsigned long now)
1612 {
1613         struct rt6_info *rt = rt6_ex->rt6i;
1614
1615         /* we are pruning and obsoleting aged-out and non gateway exceptions
1616          * even if others have still references to them, so that on next
1617          * dst_check() such references can be dropped.
1618          * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
1619          * expired, independently from their aging, as per RFC 8201 section 4
1620          */
1621         if (!(rt->rt6i_flags & RTF_EXPIRES)) {
1622                 if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1623                         RT6_TRACE("aging clone %p\n", rt);
1624                         rt6_remove_exception(bucket, rt6_ex);
1625                         return;
1626                 }
1627         } else if (time_after(jiffies, rt->dst.expires)) {
1628                 RT6_TRACE("purging expired route %p\n", rt);
1629                 rt6_remove_exception(bucket, rt6_ex);
1630                 return;
1631         }
1632
1633         if (rt->rt6i_flags & RTF_GATEWAY) {
1634                 struct neighbour *neigh;
1635                 __u8 neigh_flags = 0;
1636
1637                 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
1638                 if (neigh)
1639                         neigh_flags = neigh->flags;
1640
1641                 if (!(neigh_flags & NTF_ROUTER)) {
1642                         RT6_TRACE("purging route %p via non-router but gateway\n",
1643                                   rt);
1644                         rt6_remove_exception(bucket, rt6_ex);
1645                         return;
1646                 }
1647         }
1648
1649         gc_args->more++;
1650 }
1651
1652 void rt6_age_exceptions(struct rt6_info *rt,
1653                         struct fib6_gc_args *gc_args,
1654                         unsigned long now)
1655 {
1656         struct rt6_exception_bucket *bucket;
1657         struct rt6_exception *rt6_ex;
1658         struct hlist_node *tmp;
1659         int i;
1660
1661         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1662                 return;
1663
1664         rcu_read_lock_bh();
1665         spin_lock(&rt6_exception_lock);
1666         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1667                                     lockdep_is_held(&rt6_exception_lock));
1668
1669         if (bucket) {
1670                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1671                         hlist_for_each_entry_safe(rt6_ex, tmp,
1672                                                   &bucket->chain, hlist) {
1673                                 rt6_age_examine_exception(bucket, rt6_ex,
1674                                                           gc_args, now);
1675                         }
1676                         bucket++;
1677                 }
1678         }
1679         spin_unlock(&rt6_exception_lock);
1680         rcu_read_unlock_bh();
1681 }
1682
1683 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1684                                int oif, struct flowi6 *fl6,
1685                                const struct sk_buff *skb, int flags)
1686 {
1687         struct fib6_node *fn, *saved_fn;
1688         struct rt6_info *rt, *rt_cache;
1689         int strict = 0;
1690
1691         strict |= flags & RT6_LOOKUP_F_IFACE;
1692         strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1693         if (net->ipv6.devconf_all->forwarding == 0)
1694                 strict |= RT6_LOOKUP_F_REACHABLE;
1695
1696         rcu_read_lock();
1697
1698         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1699         saved_fn = fn;
1700
1701         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1702                 oif = 0;
1703
1704 redo_rt6_select:
1705         rt = rt6_select(net, fn, oif, strict);
1706         if (rt->rt6i_nsiblings)
1707                 rt = rt6_multipath_select(net, rt, fl6, oif, skb, strict);
1708         if (rt == net->ipv6.ip6_null_entry) {
1709                 fn = fib6_backtrack(fn, &fl6->saddr);
1710                 if (fn)
1711                         goto redo_rt6_select;
1712                 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1713                         /* also consider unreachable route */
1714                         strict &= ~RT6_LOOKUP_F_REACHABLE;
1715                         fn = saved_fn;
1716                         goto redo_rt6_select;
1717                 }
1718         }
1719
1720         /*Search through exception table */
1721         rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
1722         if (rt_cache)
1723                 rt = rt_cache;
1724
1725         if (rt == net->ipv6.ip6_null_entry) {
1726                 rcu_read_unlock();
1727                 dst_hold(&rt->dst);
1728                 trace_fib6_table_lookup(net, rt, table, fl6);
1729                 return rt;
1730         } else if (rt->rt6i_flags & RTF_CACHE) {
1731                 if (ip6_hold_safe(net, &rt, true)) {
1732                         dst_use_noref(&rt->dst, jiffies);
1733                         rt6_dst_from_metrics_check(rt);
1734                 }
1735                 rcu_read_unlock();
1736                 trace_fib6_table_lookup(net, rt, table, fl6);
1737                 return rt;
1738         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1739                             !(rt->rt6i_flags & RTF_GATEWAY))) {
1740                 /* Create a RTF_CACHE clone which will not be
1741                  * owned by the fib6 tree.  It is for the special case where
1742                  * the daddr in the skb during the neighbor look-up is different
1743                  * from the fl6->daddr used to look-up route here.
1744                  */
1745
1746                 struct rt6_info *uncached_rt;
1747
1748                 if (ip6_hold_safe(net, &rt, true)) {
1749                         dst_use_noref(&rt->dst, jiffies);
1750                 } else {
1751                         rcu_read_unlock();
1752                         uncached_rt = rt;
1753                         goto uncached_rt_out;
1754                 }
1755                 rcu_read_unlock();
1756
1757                 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1758                 dst_release(&rt->dst);
1759
1760                 if (uncached_rt) {
1761                         /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1762                          * No need for another dst_hold()
1763                          */
1764                         rt6_uncached_list_add(uncached_rt);
1765                         atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1766                 } else {
1767                         uncached_rt = net->ipv6.ip6_null_entry;
1768                         dst_hold(&uncached_rt->dst);
1769                 }
1770
1771 uncached_rt_out:
1772                 trace_fib6_table_lookup(net, uncached_rt, table, fl6);
1773                 return uncached_rt;
1774
1775         } else {
1776                 /* Get a percpu copy */
1777
1778                 struct rt6_info *pcpu_rt;
1779
1780                 dst_use_noref(&rt->dst, jiffies);
1781                 local_bh_disable();
1782                 pcpu_rt = rt6_get_pcpu_route(rt);
1783
1784                 if (!pcpu_rt) {
1785                         /* atomic_inc_not_zero() is needed when using rcu */
1786                         if (atomic_inc_not_zero(&rt->rt6i_ref)) {
1787                                 /* No dst_hold() on rt is needed because grabbing
1788                                  * rt->rt6i_ref makes sure rt can't be released.
1789                                  */
1790                                 pcpu_rt = rt6_make_pcpu_route(rt);
1791                                 rt6_release(rt);
1792                         } else {
1793                                 /* rt is already removed from tree */
1794                                 pcpu_rt = net->ipv6.ip6_null_entry;
1795                                 dst_hold(&pcpu_rt->dst);
1796                         }
1797                 }
1798                 local_bh_enable();
1799                 rcu_read_unlock();
1800                 trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
1801                 return pcpu_rt;
1802         }
1803 }
1804 EXPORT_SYMBOL_GPL(ip6_pol_route);
1805
1806 static struct rt6_info *ip6_pol_route_input(struct net *net,
1807                                             struct fib6_table *table,
1808                                             struct flowi6 *fl6,
1809                                             const struct sk_buff *skb,
1810                                             int flags)
1811 {
1812         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1813 }
1814
1815 struct dst_entry *ip6_route_input_lookup(struct net *net,
1816                                          struct net_device *dev,
1817                                          struct flowi6 *fl6,
1818                                          const struct sk_buff *skb,
1819                                          int flags)
1820 {
1821         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1822                 flags |= RT6_LOOKUP_F_IFACE;
1823
1824         return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1825 }
1826 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1827
1828 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1829                                   struct flow_keys *keys,
1830                                   struct flow_keys *flkeys)
1831 {
1832         const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1833         const struct ipv6hdr *key_iph = outer_iph;
1834         struct flow_keys *_flkeys = flkeys;
1835         const struct ipv6hdr *inner_iph;
1836         const struct icmp6hdr *icmph;
1837         struct ipv6hdr _inner_iph;
1838
1839         if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1840                 goto out;
1841
1842         icmph = icmp6_hdr(skb);
1843         if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1844             icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1845             icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1846             icmph->icmp6_type != ICMPV6_PARAMPROB)
1847                 goto out;
1848
1849         inner_iph = skb_header_pointer(skb,
1850                                        skb_transport_offset(skb) + sizeof(*icmph),
1851                                        sizeof(_inner_iph), &_inner_iph);
1852         if (!inner_iph)
1853                 goto out;
1854
1855         key_iph = inner_iph;
1856         _flkeys = NULL;
1857 out:
1858         if (_flkeys) {
1859                 keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
1860                 keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
1861                 keys->tags.flow_label = _flkeys->tags.flow_label;
1862                 keys->basic.ip_proto = _flkeys->basic.ip_proto;
1863         } else {
1864                 keys->addrs.v6addrs.src = key_iph->saddr;
1865                 keys->addrs.v6addrs.dst = key_iph->daddr;
1866                 keys->tags.flow_label = ip6_flowinfo(key_iph);
1867                 keys->basic.ip_proto = key_iph->nexthdr;
1868         }
1869 }
1870
1871 /* if skb is set it will be used and fl6 can be NULL */
1872 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
1873                        const struct sk_buff *skb, struct flow_keys *flkeys)
1874 {
1875         struct flow_keys hash_keys;
1876         u32 mhash;
1877
1878         switch (ip6_multipath_hash_policy(net)) {
1879         case 0:
1880                 memset(&hash_keys, 0, sizeof(hash_keys));
1881                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1882                 if (skb) {
1883                         ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
1884                 } else {
1885                         hash_keys.addrs.v6addrs.src = fl6->saddr;
1886                         hash_keys.addrs.v6addrs.dst = fl6->daddr;
1887                         hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
1888                         hash_keys.basic.ip_proto = fl6->flowi6_proto;
1889                 }
1890                 break;
1891         case 1:
1892                 if (skb) {
1893                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1894                         struct flow_keys keys;
1895
1896                         /* short-circuit if we already have L4 hash present */
1897                         if (skb->l4_hash)
1898                                 return skb_get_hash_raw(skb) >> 1;
1899
1900                         memset(&hash_keys, 0, sizeof(hash_keys));
1901
1902                         if (!flkeys) {
1903                                 skb_flow_dissect_flow_keys(skb, &keys, flag);
1904                                 flkeys = &keys;
1905                         }
1906                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1907                         hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
1908                         hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
1909                         hash_keys.ports.src = flkeys->ports.src;
1910                         hash_keys.ports.dst = flkeys->ports.dst;
1911                         hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1912                 } else {
1913                         memset(&hash_keys, 0, sizeof(hash_keys));
1914                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1915                         hash_keys.addrs.v6addrs.src = fl6->saddr;
1916                         hash_keys.addrs.v6addrs.dst = fl6->daddr;
1917                         hash_keys.ports.src = fl6->fl6_sport;
1918                         hash_keys.ports.dst = fl6->fl6_dport;
1919                         hash_keys.basic.ip_proto = fl6->flowi6_proto;
1920                 }
1921                 break;
1922         }
1923         mhash = flow_hash_from_keys(&hash_keys);
1924
1925         return mhash >> 1;
1926 }
1927
1928 void ip6_route_input(struct sk_buff *skb)
1929 {
1930         const struct ipv6hdr *iph = ipv6_hdr(skb);
1931         struct net *net = dev_net(skb->dev);
1932         int flags = RT6_LOOKUP_F_HAS_SADDR;
1933         struct ip_tunnel_info *tun_info;
1934         struct flowi6 fl6 = {
1935                 .flowi6_iif = skb->dev->ifindex,
1936                 .daddr = iph->daddr,
1937                 .saddr = iph->saddr,
1938                 .flowlabel = ip6_flowinfo(iph),
1939                 .flowi6_mark = skb->mark,
1940                 .flowi6_proto = iph->nexthdr,
1941         };
1942         struct flow_keys *flkeys = NULL, _flkeys;
1943
1944         tun_info = skb_tunnel_info(skb);
1945         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1946                 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1947
1948         if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
1949                 flkeys = &_flkeys;
1950
1951         if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
1952                 fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
1953         skb_dst_drop(skb);
1954         skb_dst_set(skb,
1955                     ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
1956 }
1957
1958 static struct rt6_info *ip6_pol_route_output(struct net *net,
1959                                              struct fib6_table *table,
1960                                              struct flowi6 *fl6,
1961                                              const struct sk_buff *skb,
1962                                              int flags)
1963 {
1964         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
1965 }
1966
1967 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1968                                          struct flowi6 *fl6, int flags)
1969 {
1970         bool any_src;
1971
1972         if (rt6_need_strict(&fl6->daddr)) {
1973                 struct dst_entry *dst;
1974
1975                 dst = l3mdev_link_scope_lookup(net, fl6);
1976                 if (dst)
1977                         return dst;
1978         }
1979
1980         fl6->flowi6_iif = LOOPBACK_IFINDEX;
1981
1982         any_src = ipv6_addr_any(&fl6->saddr);
1983         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1984             (fl6->flowi6_oif && any_src))
1985                 flags |= RT6_LOOKUP_F_IFACE;
1986
1987         if (!any_src)
1988                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1989         else if (sk)
1990                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1991
1992         return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
1993 }
1994 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1995
1996 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1997 {
1998         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1999         struct net_device *loopback_dev = net->loopback_dev;
2000         struct dst_entry *new = NULL;
2001
2002         rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2003                        DST_OBSOLETE_DEAD, 0);
2004         if (rt) {
2005                 rt6_info_init(rt);
2006                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2007
2008                 new = &rt->dst;
2009                 new->__use = 1;
2010                 new->input = dst_discard;
2011                 new->output = dst_discard_out;
2012
2013                 dst_copy_metrics(new, &ort->dst);
2014
2015                 rt->rt6i_idev = in6_dev_get(loopback_dev);
2016                 rt->rt6i_gateway = ort->rt6i_gateway;
2017                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2018                 rt->rt6i_metric = 0;
2019
2020                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2021 #ifdef CONFIG_IPV6_SUBTREES
2022                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2023 #endif
2024         }
2025
2026         dst_release(dst_orig);
2027         return new ? new : ERR_PTR(-ENOMEM);
2028 }
2029
2030 /*
2031  *      Destination cache support functions
2032  */
2033
2034 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
2035 {
2036         if (rt->from &&
2037             dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(&rt->from->dst))
2038                 dst_init_metrics(&rt->dst, dst_metrics_ptr(&rt->from->dst), true);
2039 }
2040
2041 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
2042 {
2043         u32 rt_cookie = 0;
2044
2045         if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
2046                 return NULL;
2047
2048         if (rt6_check_expired(rt))
2049                 return NULL;
2050
2051         return &rt->dst;
2052 }
2053
2054 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
2055 {
2056         if (!__rt6_check_expired(rt) &&
2057             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2058             rt6_check(rt->from, cookie))
2059                 return &rt->dst;
2060         else
2061                 return NULL;
2062 }
2063
2064 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2065 {
2066         struct rt6_info *rt;
2067
2068         rt = (struct rt6_info *) dst;
2069
2070         /* All IPV6 dsts are created with ->obsolete set to the value
2071          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2072          * into this function always.
2073          */
2074
2075         rt6_dst_from_metrics_check(rt);
2076
2077         if (rt->rt6i_flags & RTF_PCPU ||
2078             (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from))
2079                 return rt6_dst_from_check(rt, cookie);
2080         else
2081                 return rt6_check(rt, cookie);
2082 }
2083
2084 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2085 {
2086         struct rt6_info *rt = (struct rt6_info *) dst;
2087
2088         if (rt) {
2089                 if (rt->rt6i_flags & RTF_CACHE) {
2090                         if (rt6_check_expired(rt)) {
2091                                 ip6_del_rt(rt);
2092                                 dst = NULL;
2093                         }
2094                 } else {
2095                         dst_release(dst);
2096                         dst = NULL;
2097                 }
2098         }
2099         return dst;
2100 }
2101
2102 static void ip6_link_failure(struct sk_buff *skb)
2103 {
2104         struct rt6_info *rt;
2105
2106         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2107
2108         rt = (struct rt6_info *) skb_dst(skb);
2109         if (rt) {
2110                 if (rt->rt6i_flags & RTF_CACHE) {
2111                         if (dst_hold_safe(&rt->dst))
2112                                 ip6_del_rt(rt);
2113                 } else {
2114                         struct fib6_node *fn;
2115
2116                         rcu_read_lock();
2117                         fn = rcu_dereference(rt->rt6i_node);
2118                         if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2119                                 fn->fn_sernum = -1;
2120                         rcu_read_unlock();
2121                 }
2122         }
2123 }
2124
2125 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2126 {
2127         struct net *net = dev_net(rt->dst.dev);
2128
2129         rt->rt6i_flags |= RTF_MODIFIED;
2130         rt->rt6i_pmtu = mtu;
2131         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2132 }
2133
2134 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2135 {
2136         return !(rt->rt6i_flags & RTF_CACHE) &&
2137                 (rt->rt6i_flags & RTF_PCPU ||
2138                  rcu_access_pointer(rt->rt6i_node));
2139 }
2140
2141 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2142                                  const struct ipv6hdr *iph, u32 mtu)
2143 {
2144         const struct in6_addr *daddr, *saddr;
2145         struct rt6_info *rt6 = (struct rt6_info *)dst;
2146
2147         if (rt6->rt6i_flags & RTF_LOCAL)
2148                 return;
2149
2150         if (dst_metric_locked(dst, RTAX_MTU))
2151                 return;
2152
2153         if (iph) {
2154                 daddr = &iph->daddr;
2155                 saddr = &iph->saddr;
2156         } else if (sk) {
2157                 daddr = &sk->sk_v6_daddr;
2158                 saddr = &inet6_sk(sk)->saddr;
2159         } else {
2160                 daddr = NULL;
2161                 saddr = NULL;
2162         }
2163         dst_confirm_neigh(dst, daddr);
2164         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2165         if (mtu >= dst_mtu(dst))
2166                 return;
2167
2168         if (!rt6_cache_allowed_for_pmtu(rt6)) {
2169                 rt6_do_update_pmtu(rt6, mtu);
2170                 /* update rt6_ex->stamp for cache */
2171                 if (rt6->rt6i_flags & RTF_CACHE)
2172                         rt6_update_exception_stamp_rt(rt6);
2173         } else if (daddr) {
2174                 struct rt6_info *nrt6;
2175
2176                 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
2177                 if (nrt6) {
2178                         rt6_do_update_pmtu(nrt6, mtu);
2179                         if (rt6_insert_exception(nrt6, rt6))
2180                                 dst_release_immediate(&nrt6->dst);
2181                 }
2182         }
2183 }
2184
2185 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2186                                struct sk_buff *skb, u32 mtu)
2187 {
2188         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2189 }
2190
2191 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2192                      int oif, u32 mark, kuid_t uid)
2193 {
2194         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2195         struct dst_entry *dst;
2196         struct flowi6 fl6;
2197
2198         memset(&fl6, 0, sizeof(fl6));
2199         fl6.flowi6_oif = oif;
2200         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2201         fl6.daddr = iph->daddr;
2202         fl6.saddr = iph->saddr;
2203         fl6.flowlabel = ip6_flowinfo(iph);
2204         fl6.flowi6_uid = uid;
2205
2206         dst = ip6_route_output(net, NULL, &fl6);
2207         if (!dst->error)
2208                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2209         dst_release(dst);
2210 }
2211 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2212
2213 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2214 {
2215         struct dst_entry *dst;
2216
2217         ip6_update_pmtu(skb, sock_net(sk), mtu,
2218                         sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
2219
2220         dst = __sk_dst_get(sk);
2221         if (!dst || !dst->obsolete ||
2222             dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2223                 return;
2224
2225         bh_lock_sock(sk);
2226         if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2227                 ip6_datagram_dst_update(sk, false);
2228         bh_unlock_sock(sk);
2229 }
2230 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2231
2232 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2233                            const struct flowi6 *fl6)
2234 {
2235 #ifdef CONFIG_IPV6_SUBTREES
2236         struct ipv6_pinfo *np = inet6_sk(sk);
2237 #endif
2238
2239         ip6_dst_store(sk, dst,
2240                       ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2241                       &sk->sk_v6_daddr : NULL,
2242 #ifdef CONFIG_IPV6_SUBTREES
2243                       ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2244                       &np->saddr :
2245 #endif
2246                       NULL);
2247 }
2248
2249 /* Handle redirects */
2250 struct ip6rd_flowi {
2251         struct flowi6 fl6;
2252         struct in6_addr gateway;
2253 };
2254
2255 static struct rt6_info *__ip6_route_redirect(struct net *net,
2256                                              struct fib6_table *table,
2257                                              struct flowi6 *fl6,
2258                                              const struct sk_buff *skb,
2259                                              int flags)
2260 {
2261         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2262         struct rt6_info *rt, *rt_cache;
2263         struct fib6_node *fn;
2264
2265         /* Get the "current" route for this destination and
2266          * check if the redirect has come from appropriate router.
2267          *
2268          * RFC 4861 specifies that redirects should only be
2269          * accepted if they come from the nexthop to the target.
2270          * Due to the way the routes are chosen, this notion
2271          * is a bit fuzzy and one might need to check all possible
2272          * routes.
2273          */
2274
2275         rcu_read_lock();
2276         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2277 restart:
2278         for_each_fib6_node_rt_rcu(fn) {
2279                 if (rt->rt6i_nh_flags & RTNH_F_DEAD)
2280                         continue;
2281                 if (rt6_check_expired(rt))
2282                         continue;
2283                 if (rt->dst.error)
2284                         break;
2285                 if (!(rt->rt6i_flags & RTF_GATEWAY))
2286                         continue;
2287                 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
2288                         continue;
2289                 /* rt_cache's gateway might be different from its 'parent'
2290                  * in the case of an ip redirect.
2291                  * So we keep searching in the exception table if the gateway
2292                  * is different.
2293                  */
2294                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
2295                         rt_cache = rt6_find_cached_rt(rt,
2296                                                       &fl6->daddr,
2297                                                       &fl6->saddr);
2298                         if (rt_cache &&
2299                             ipv6_addr_equal(&rdfl->gateway,
2300                                             &rt_cache->rt6i_gateway)) {
2301                                 rt = rt_cache;
2302                                 break;
2303                         }
2304                         continue;
2305                 }
2306                 break;
2307         }
2308
2309         if (!rt)
2310                 rt = net->ipv6.ip6_null_entry;
2311         else if (rt->dst.error) {
2312                 rt = net->ipv6.ip6_null_entry;
2313                 goto out;
2314         }
2315
2316         if (rt == net->ipv6.ip6_null_entry) {
2317                 fn = fib6_backtrack(fn, &fl6->saddr);
2318                 if (fn)
2319                         goto restart;
2320         }
2321
2322 out:
2323         ip6_hold_safe(net, &rt, true);
2324
2325         rcu_read_unlock();
2326
2327         trace_fib6_table_lookup(net, rt, table, fl6);
2328         return rt;
2329 };
2330
2331 static struct dst_entry *ip6_route_redirect(struct net *net,
2332                                             const struct flowi6 *fl6,
2333                                             const struct sk_buff *skb,
2334                                             const struct in6_addr *gateway)
2335 {
2336         int flags = RT6_LOOKUP_F_HAS_SADDR;
2337         struct ip6rd_flowi rdfl;
2338
2339         rdfl.fl6 = *fl6;
2340         rdfl.gateway = *gateway;
2341
2342         return fib6_rule_lookup(net, &rdfl.fl6, skb,
2343                                 flags, __ip6_route_redirect);
2344 }
2345
2346 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2347                   kuid_t uid)
2348 {
2349         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2350         struct dst_entry *dst;
2351         struct flowi6 fl6;
2352
2353         memset(&fl6, 0, sizeof(fl6));
2354         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2355         fl6.flowi6_oif = oif;
2356         fl6.flowi6_mark = mark;
2357         fl6.daddr = iph->daddr;
2358         fl6.saddr = iph->saddr;
2359         fl6.flowlabel = ip6_flowinfo(iph);
2360         fl6.flowi6_uid = uid;
2361
2362         dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2363         rt6_do_redirect(dst, NULL, skb);
2364         dst_release(dst);
2365 }
2366 EXPORT_SYMBOL_GPL(ip6_redirect);
2367
2368 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2369                             u32 mark)
2370 {
2371         const struct ipv6hdr *iph = ipv6_hdr(skb);
2372         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2373         struct dst_entry *dst;
2374         struct flowi6 fl6;
2375
2376         memset(&fl6, 0, sizeof(fl6));
2377         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2378         fl6.flowi6_oif = oif;
2379         fl6.flowi6_mark = mark;
2380         fl6.daddr = msg->dest;
2381         fl6.saddr = iph->daddr;
2382         fl6.flowi6_uid = sock_net_uid(net, NULL);
2383
2384         dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2385         rt6_do_redirect(dst, NULL, skb);
2386         dst_release(dst);
2387 }
2388
2389 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2390 {
2391         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2392                      sk->sk_uid);
2393 }
2394 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2395
2396 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2397 {
2398         struct net_device *dev = dst->dev;
2399         unsigned int mtu = dst_mtu(dst);
2400         struct net *net = dev_net(dev);
2401
2402         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2403
2404         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2405                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2406
2407         /*
2408          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2409          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2410          * IPV6_MAXPLEN is also valid and means: "any MSS,
2411          * rely only on pmtu discovery"
2412          */
2413         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2414                 mtu = IPV6_MAXPLEN;
2415         return mtu;
2416 }
2417
2418 static unsigned int ip6_mtu(const struct dst_entry *dst)
2419 {
2420         const struct rt6_info *rt = (const struct rt6_info *)dst;
2421         unsigned int mtu = rt->rt6i_pmtu;
2422         struct inet6_dev *idev;
2423
2424         if (mtu)
2425                 goto out;
2426
2427         mtu = dst_metric_raw(dst, RTAX_MTU);
2428         if (mtu)
2429                 goto out;
2430
2431         mtu = IPV6_MIN_MTU;
2432
2433         rcu_read_lock();
2434         idev = __in6_dev_get(dst->dev);
2435         if (idev)
2436                 mtu = idev->cnf.mtu6;
2437         rcu_read_unlock();
2438
2439 out:
2440         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2441
2442         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2443 }
2444
2445 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2446                                   struct flowi6 *fl6)
2447 {
2448         struct dst_entry *dst;
2449         struct rt6_info *rt;
2450         struct inet6_dev *idev = in6_dev_get(dev);
2451         struct net *net = dev_net(dev);
2452
2453         if (unlikely(!idev))
2454                 return ERR_PTR(-ENODEV);
2455
2456         rt = ip6_dst_alloc(net, dev, 0);
2457         if (unlikely(!rt)) {
2458                 in6_dev_put(idev);
2459                 dst = ERR_PTR(-ENOMEM);
2460                 goto out;
2461         }
2462
2463         rt->dst.flags |= DST_HOST;
2464         rt->dst.input = ip6_input;
2465         rt->dst.output  = ip6_output;
2466         rt->rt6i_gateway  = fl6->daddr;
2467         rt->rt6i_dst.addr = fl6->daddr;
2468         rt->rt6i_dst.plen = 128;
2469         rt->rt6i_idev     = idev;
2470         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2471
2472         /* Add this dst into uncached_list so that rt6_disable_ip() can
2473          * do proper release of the net_device
2474          */
2475         rt6_uncached_list_add(rt);
2476         atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2477
2478         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2479
2480 out:
2481         return dst;
2482 }
2483
2484 static int ip6_dst_gc(struct dst_ops *ops)
2485 {
2486         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2487         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2488         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2489         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2490         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2491         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2492         int entries;
2493
2494         entries = dst_entries_get_fast(ops);
2495         if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2496             entries <= rt_max_size)
2497                 goto out;
2498
2499         net->ipv6.ip6_rt_gc_expire++;
2500         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2501         entries = dst_entries_get_slow(ops);
2502         if (entries < ops->gc_thresh)
2503                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2504 out:
2505         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2506         return entries > rt_max_size;
2507 }
2508
2509 static int ip6_convert_metrics(struct mx6_config *mxc,
2510                                const struct fib6_config *cfg)
2511 {
2512         struct net *net = cfg->fc_nlinfo.nl_net;
2513         bool ecn_ca = false;
2514         struct nlattr *nla;
2515         int remaining;
2516         u32 *mp;
2517
2518         if (!cfg->fc_mx)
2519                 return 0;
2520
2521         mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
2522         if (unlikely(!mp))
2523                 return -ENOMEM;
2524
2525         nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
2526                 int type = nla_type(nla);
2527                 u32 val;
2528
2529                 if (!type)
2530                         continue;
2531                 if (unlikely(type > RTAX_MAX))
2532                         goto err;
2533
2534                 if (type == RTAX_CC_ALGO) {
2535                         char tmp[TCP_CA_NAME_MAX];
2536
2537                         nla_strlcpy(tmp, nla, sizeof(tmp));
2538                         val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca);
2539                         if (val == TCP_CA_UNSPEC)
2540                                 goto err;
2541                 } else {
2542                         val = nla_get_u32(nla);
2543                 }
2544                 if (type == RTAX_HOPLIMIT && val > 255)
2545                         val = 255;
2546                 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
2547                         goto err;
2548
2549                 mp[type - 1] = val;
2550                 __set_bit(type - 1, mxc->mx_valid);
2551         }
2552
2553         if (ecn_ca) {
2554                 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
2555                 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
2556         }
2557
2558         mxc->mx = mp;
2559         return 0;
2560  err:
2561         kfree(mp);
2562         return -EINVAL;
2563 }
2564
2565 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2566                                             struct fib6_config *cfg,
2567                                             const struct in6_addr *gw_addr,
2568                                             u32 tbid, int flags)
2569 {
2570         struct flowi6 fl6 = {
2571                 .flowi6_oif = cfg->fc_ifindex,
2572                 .daddr = *gw_addr,
2573                 .saddr = cfg->fc_prefsrc,
2574         };
2575         struct fib6_table *table;
2576         struct rt6_info *rt;
2577
2578         table = fib6_get_table(net, tbid);
2579         if (!table)
2580                 return NULL;
2581
2582         if (!ipv6_addr_any(&cfg->fc_prefsrc))
2583                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2584
2585         flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2586         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2587
2588         /* if table lookup failed, fall back to full lookup */
2589         if (rt == net->ipv6.ip6_null_entry) {
2590                 ip6_rt_put(rt);
2591                 rt = NULL;
2592         }
2593
2594         return rt;
2595 }
2596
2597 static int ip6_route_check_nh_onlink(struct net *net,
2598                                      struct fib6_config *cfg,
2599                                      const struct net_device *dev,
2600                                      struct netlink_ext_ack *extack)
2601 {
2602         u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2603         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2604         u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2605         struct rt6_info *grt;
2606         int err;
2607
2608         err = 0;
2609         grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2610         if (grt) {
2611                 if (!grt->dst.error &&
2612                     (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2613                         NL_SET_ERR_MSG(extack,
2614                                        "Nexthop has invalid gateway or device mismatch");
2615                         err = -EINVAL;
2616                 }
2617
2618                 ip6_rt_put(grt);
2619         }
2620
2621         return err;
2622 }
2623
2624 static int ip6_route_check_nh(struct net *net,
2625                               struct fib6_config *cfg,
2626                               struct net_device **_dev,
2627                               struct inet6_dev **idev)
2628 {
2629         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2630         struct net_device *dev = _dev ? *_dev : NULL;
2631         struct rt6_info *grt = NULL;
2632         int err = -EHOSTUNREACH;
2633
2634         if (cfg->fc_table) {
2635                 int flags = RT6_LOOKUP_F_IFACE;
2636
2637                 grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2638                                           cfg->fc_table, flags);
2639                 if (grt) {
2640                         if (grt->rt6i_flags & RTF_GATEWAY ||
2641                             (dev && dev != grt->dst.dev)) {
2642                                 ip6_rt_put(grt);
2643                                 grt = NULL;
2644                         }
2645                 }
2646         }
2647
2648         if (!grt)
2649                 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2650
2651         if (!grt)
2652                 goto out;
2653
2654         if (dev) {
2655                 if (dev != grt->dst.dev) {
2656                         ip6_rt_put(grt);
2657                         goto out;
2658                 }
2659         } else {
2660                 *_dev = dev = grt->dst.dev;
2661                 *idev = grt->rt6i_idev;
2662                 dev_hold(dev);
2663                 in6_dev_hold(grt->rt6i_idev);
2664         }
2665
2666         if (!(grt->rt6i_flags & RTF_GATEWAY))
2667                 err = 0;
2668
2669         ip6_rt_put(grt);
2670
2671 out:
2672         return err;
2673 }
2674
2675 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2676                            struct net_device **_dev, struct inet6_dev **idev,
2677                            struct netlink_ext_ack *extack)
2678 {
2679         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2680         int gwa_type = ipv6_addr_type(gw_addr);
2681         bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2682         const struct net_device *dev = *_dev;
2683         bool need_addr_check = !dev;
2684         int err = -EINVAL;
2685
2686         /* if gw_addr is local we will fail to detect this in case
2687          * address is still TENTATIVE (DAD in progress). rt6_lookup()
2688          * will return already-added prefix route via interface that
2689          * prefix route was assigned to, which might be non-loopback.
2690          */
2691         if (dev &&
2692             ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2693                 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2694                 goto out;
2695         }
2696
2697         if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2698                 /* IPv6 strictly inhibits using not link-local
2699                  * addresses as nexthop address.
2700                  * Otherwise, router will not able to send redirects.
2701                  * It is very good, but in some (rare!) circumstances
2702                  * (SIT, PtP, NBMA NOARP links) it is handy to allow
2703                  * some exceptions. --ANK
2704                  * We allow IPv4-mapped nexthops to support RFC4798-type
2705                  * addressing
2706                  */
2707                 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2708                         NL_SET_ERR_MSG(extack, "Invalid gateway address");
2709                         goto out;
2710                 }
2711
2712                 if (cfg->fc_flags & RTNH_F_ONLINK)
2713                         err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2714                 else
2715                         err = ip6_route_check_nh(net, cfg, _dev, idev);
2716
2717                 if (err)
2718                         goto out;
2719         }
2720
2721         /* reload in case device was changed */
2722         dev = *_dev;
2723
2724         err = -EINVAL;
2725         if (!dev) {
2726                 NL_SET_ERR_MSG(extack, "Egress device not specified");
2727                 goto out;
2728         } else if (dev->flags & IFF_LOOPBACK) {
2729                 NL_SET_ERR_MSG(extack,
2730                                "Egress device can not be loopback device for this route");
2731                 goto out;
2732         }
2733
2734         /* if we did not check gw_addr above, do so now that the
2735          * egress device has been resolved.
2736          */
2737         if (need_addr_check &&
2738             ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2739                 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2740                 goto out;
2741         }
2742
2743         err = 0;
2744 out:
2745         return err;
2746 }
2747
2748 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
2749                                               struct netlink_ext_ack *extack)
2750 {
2751         struct net *net = cfg->fc_nlinfo.nl_net;
2752         struct rt6_info *rt = NULL;
2753         struct net_device *dev = NULL;
2754         struct inet6_dev *idev = NULL;
2755         struct fib6_table *table;
2756         int addr_type;
2757         int err = -EINVAL;
2758
2759         /* RTF_PCPU is an internal flag; can not be set by userspace */
2760         if (cfg->fc_flags & RTF_PCPU) {
2761                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2762                 goto out;
2763         }
2764
2765         /* RTF_CACHE is an internal flag; can not be set by userspace */
2766         if (cfg->fc_flags & RTF_CACHE) {
2767                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2768                 goto out;
2769         }
2770
2771         if (cfg->fc_dst_len > 128) {
2772                 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2773                 goto out;
2774         }
2775         if (cfg->fc_src_len > 128) {
2776                 NL_SET_ERR_MSG(extack, "Invalid source address length");
2777                 goto out;
2778         }
2779 #ifndef CONFIG_IPV6_SUBTREES
2780         if (cfg->fc_src_len) {
2781                 NL_SET_ERR_MSG(extack,
2782                                "Specifying source address requires IPV6_SUBTREES to be enabled");
2783                 goto out;
2784         }
2785 #endif
2786         if (cfg->fc_ifindex) {
2787                 err = -ENODEV;
2788                 dev = dev_get_by_index(net, cfg->fc_ifindex);
2789                 if (!dev)
2790                         goto out;
2791                 idev = in6_dev_get(dev);
2792                 if (!idev)
2793                         goto out;
2794         }
2795
2796         if (cfg->fc_metric == 0)
2797                 cfg->fc_metric = IP6_RT_PRIO_USER;
2798
2799         if (cfg->fc_flags & RTNH_F_ONLINK) {
2800                 if (!dev) {
2801                         NL_SET_ERR_MSG(extack,
2802                                        "Nexthop device required for onlink");
2803                         err = -ENODEV;
2804                         goto out;
2805                 }
2806
2807                 if (!(dev->flags & IFF_UP)) {
2808                         NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2809                         err = -ENETDOWN;
2810                         goto out;
2811                 }
2812         }
2813
2814         err = -ENOBUFS;
2815         if (cfg->fc_nlinfo.nlh &&
2816             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2817                 table = fib6_get_table(net, cfg->fc_table);
2818                 if (!table) {
2819                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2820                         table = fib6_new_table(net, cfg->fc_table);
2821                 }
2822         } else {
2823                 table = fib6_new_table(net, cfg->fc_table);
2824         }
2825
2826         if (!table)
2827                 goto out;
2828
2829         rt = ip6_dst_alloc(net, NULL,
2830                            (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
2831
2832         if (!rt) {
2833                 err = -ENOMEM;
2834                 goto out;
2835         }
2836
2837         if (cfg->fc_flags & RTF_EXPIRES)
2838                 rt6_set_expires(rt, jiffies +
2839                                 clock_t_to_jiffies(cfg->fc_expires));
2840         else
2841                 rt6_clean_expires(rt);
2842
2843         if (cfg->fc_protocol == RTPROT_UNSPEC)
2844                 cfg->fc_protocol = RTPROT_BOOT;
2845         rt->rt6i_protocol = cfg->fc_protocol;
2846
2847         addr_type = ipv6_addr_type(&cfg->fc_dst);
2848
2849         if (addr_type & IPV6_ADDR_MULTICAST)
2850                 rt->dst.input = ip6_mc_input;
2851         else if (cfg->fc_flags & RTF_LOCAL)
2852                 rt->dst.input = ip6_input;
2853         else
2854                 rt->dst.input = ip6_forward;
2855
2856         rt->dst.output = ip6_output;
2857
2858         if (cfg->fc_encap) {
2859                 struct lwtunnel_state *lwtstate;
2860
2861                 err = lwtunnel_build_state(cfg->fc_encap_type,
2862                                            cfg->fc_encap, AF_INET6, cfg,
2863                                            &lwtstate, extack);
2864                 if (err)
2865                         goto out;
2866                 rt->dst.lwtstate = lwtstate_get(lwtstate);
2867                 lwtunnel_set_redirect(&rt->dst);
2868         }
2869
2870         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
2871         rt->rt6i_dst.plen = cfg->fc_dst_len;
2872         if (rt->rt6i_dst.plen == 128)
2873                 rt->dst.flags |= DST_HOST;
2874
2875 #ifdef CONFIG_IPV6_SUBTREES
2876         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
2877         rt->rt6i_src.plen = cfg->fc_src_len;
2878 #endif
2879
2880         rt->rt6i_metric = cfg->fc_metric;
2881         rt->rt6i_nh_weight = 1;
2882
2883         /* We cannot add true routes via loopback here,
2884            they would result in kernel looping; promote them to reject routes
2885          */
2886         if ((cfg->fc_flags & RTF_REJECT) ||
2887             (dev && (dev->flags & IFF_LOOPBACK) &&
2888              !(addr_type & IPV6_ADDR_LOOPBACK) &&
2889              !(cfg->fc_flags & RTF_LOCAL))) {
2890                 /* hold loopback dev/idev if we haven't done so. */
2891                 if (dev != net->loopback_dev) {
2892                         if (dev) {
2893                                 dev_put(dev);
2894                                 in6_dev_put(idev);
2895                         }
2896                         dev = net->loopback_dev;
2897                         dev_hold(dev);
2898                         idev = in6_dev_get(dev);
2899                         if (!idev) {
2900                                 err = -ENODEV;
2901                                 goto out;
2902                         }
2903                 }
2904                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
2905                 switch (cfg->fc_type) {
2906                 case RTN_BLACKHOLE:
2907                         rt->dst.error = -EINVAL;
2908                         rt->dst.output = dst_discard_out;
2909                         rt->dst.input = dst_discard;
2910                         break;
2911                 case RTN_PROHIBIT:
2912                         rt->dst.error = -EACCES;
2913                         rt->dst.output = ip6_pkt_prohibit_out;
2914                         rt->dst.input = ip6_pkt_prohibit;
2915                         break;
2916                 case RTN_THROW:
2917                 case RTN_UNREACHABLE:
2918                 default:
2919                         rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
2920                                         : (cfg->fc_type == RTN_UNREACHABLE)
2921                                         ? -EHOSTUNREACH : -ENETUNREACH;
2922                         rt->dst.output = ip6_pkt_discard_out;
2923                         rt->dst.input = ip6_pkt_discard;
2924                         break;
2925                 }
2926                 goto install_route;
2927         }
2928
2929         if (cfg->fc_flags & RTF_GATEWAY) {
2930                 err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
2931                 if (err)
2932                         goto out;
2933
2934                 rt->rt6i_gateway = cfg->fc_gateway;
2935         }
2936
2937         err = -ENODEV;
2938         if (!dev)
2939                 goto out;
2940
2941         if (idev->cnf.disable_ipv6) {
2942                 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
2943                 err = -EACCES;
2944                 goto out;
2945         }
2946
2947         if (!(dev->flags & IFF_UP)) {
2948                 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2949                 err = -ENETDOWN;
2950                 goto out;
2951         }
2952
2953         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2954                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2955                         NL_SET_ERR_MSG(extack, "Invalid source address");
2956                         err = -EINVAL;
2957                         goto out;
2958                 }
2959                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2960                 rt->rt6i_prefsrc.plen = 128;
2961         } else
2962                 rt->rt6i_prefsrc.plen = 0;
2963
2964         rt->rt6i_flags = cfg->fc_flags;
2965
2966 install_route:
2967         if (!(rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
2968             !netif_carrier_ok(dev))
2969                 rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
2970         rt->rt6i_nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
2971         rt->dst.dev = dev;
2972         rt->rt6i_idev = idev;
2973         rt->rt6i_table = table;
2974
2975         cfg->fc_nlinfo.nl_net = dev_net(dev);
2976
2977         return rt;
2978 out:
2979         if (dev)
2980                 dev_put(dev);
2981         if (idev)
2982                 in6_dev_put(idev);
2983         if (rt)
2984                 dst_release_immediate(&rt->dst);
2985
2986         return ERR_PTR(err);
2987 }
2988
2989 int ip6_route_add(struct fib6_config *cfg,
2990                   struct netlink_ext_ack *extack)
2991 {
2992         struct mx6_config mxc = { .mx = NULL, };
2993         struct rt6_info *rt;
2994         int err;
2995
2996         rt = ip6_route_info_create(cfg, extack);
2997         if (IS_ERR(rt)) {
2998                 err = PTR_ERR(rt);
2999                 rt = NULL;
3000                 goto out;
3001         }
3002
3003         err = ip6_convert_metrics(&mxc, cfg);
3004         if (err)
3005                 goto out;
3006
3007         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
3008
3009         kfree(mxc.mx);
3010
3011         return err;
3012 out:
3013         if (rt)
3014                 dst_release_immediate(&rt->dst);
3015
3016         return err;
3017 }
3018
3019 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
3020 {
3021         int err;
3022         struct fib6_table *table;
3023         struct net *net = dev_net(rt->dst.dev);
3024
3025         if (rt == net->ipv6.ip6_null_entry) {
3026                 err = -ENOENT;
3027                 goto out;
3028         }
3029
3030         table = rt->rt6i_table;
3031         spin_lock_bh(&table->tb6_lock);
3032         err = fib6_del(rt, info);
3033         spin_unlock_bh(&table->tb6_lock);
3034
3035 out:
3036         ip6_rt_put(rt);
3037         return err;
3038 }
3039
3040 int ip6_del_rt(struct rt6_info *rt)
3041 {
3042         struct nl_info info = {
3043                 .nl_net = dev_net(rt->dst.dev),
3044         };
3045         return __ip6_del_rt(rt, &info);
3046 }
3047
3048 static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
3049 {
3050         struct nl_info *info = &cfg->fc_nlinfo;
3051         struct net *net = info->nl_net;
3052         struct sk_buff *skb = NULL;
3053         struct fib6_table *table;
3054         int err = -ENOENT;
3055
3056         if (rt == net->ipv6.ip6_null_entry)
3057                 goto out_put;
3058         table = rt->rt6i_table;
3059         spin_lock_bh(&table->tb6_lock);
3060
3061         if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
3062                 struct rt6_info *sibling, *next_sibling;
3063
3064                 /* prefer to send a single notification with all hops */
3065                 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3066                 if (skb) {
3067                         u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3068
3069                         if (rt6_fill_node(net, skb, rt,
3070                                           NULL, NULL, 0, RTM_DELROUTE,
3071                                           info->portid, seq, 0) < 0) {
3072                                 kfree_skb(skb);
3073                                 skb = NULL;
3074                         } else
3075                                 info->skip_notify = 1;
3076                 }
3077
3078                 list_for_each_entry_safe(sibling, next_sibling,
3079                                          &rt->rt6i_siblings,
3080                                          rt6i_siblings) {
3081                         err = fib6_del(sibling, info);
3082                         if (err)
3083                                 goto out_unlock;
3084                 }
3085         }
3086
3087         err = fib6_del(rt, info);
3088 out_unlock:
3089         spin_unlock_bh(&table->tb6_lock);
3090 out_put:
3091         ip6_rt_put(rt);
3092
3093         if (skb) {
3094                 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3095                             info->nlh, gfp_any());
3096         }
3097         return err;
3098 }
3099
3100 static int ip6_route_del(struct fib6_config *cfg,
3101                          struct netlink_ext_ack *extack)
3102 {
3103         struct rt6_info *rt, *rt_cache;
3104         struct fib6_table *table;
3105         struct fib6_node *fn;
3106         int err = -ESRCH;
3107
3108         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3109         if (!table) {
3110                 NL_SET_ERR_MSG(extack, "FIB table does not exist");
3111                 return err;
3112         }
3113
3114         rcu_read_lock();
3115
3116         fn = fib6_locate(&table->tb6_root,
3117                          &cfg->fc_dst, cfg->fc_dst_len,
3118                          &cfg->fc_src, cfg->fc_src_len,
3119                          !(cfg->fc_flags & RTF_CACHE));
3120
3121         if (fn) {
3122                 for_each_fib6_node_rt_rcu(fn) {
3123                         if (cfg->fc_flags & RTF_CACHE) {
3124                                 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3125                                                               &cfg->fc_src);
3126                                 if (!rt_cache)
3127                                         continue;
3128                                 rt = rt_cache;
3129                         }
3130                         if (cfg->fc_ifindex &&
3131                             (!rt->dst.dev ||
3132                              rt->dst.dev->ifindex != cfg->fc_ifindex))
3133                                 continue;
3134                         if (cfg->fc_flags & RTF_GATEWAY &&
3135                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3136                                 continue;
3137                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
3138                                 continue;
3139                         if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
3140                                 continue;
3141                         if (!dst_hold_safe(&rt->dst))
3142                                 break;
3143                         rcu_read_unlock();
3144
3145                         /* if gateway was specified only delete the one hop */
3146                         if (cfg->fc_flags & RTF_GATEWAY)
3147                                 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3148
3149                         return __ip6_del_rt_siblings(rt, cfg);
3150                 }
3151         }
3152         rcu_read_unlock();
3153
3154         return err;
3155 }
3156
3157 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3158 {
3159         struct netevent_redirect netevent;
3160         struct rt6_info *rt, *nrt = NULL;
3161         struct ndisc_options ndopts;
3162         struct inet6_dev *in6_dev;
3163         struct neighbour *neigh;
3164         struct rd_msg *msg;
3165         int optlen, on_link;
3166         u8 *lladdr;
3167
3168         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3169         optlen -= sizeof(*msg);
3170
3171         if (optlen < 0) {
3172                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3173                 return;
3174         }
3175
3176         msg = (struct rd_msg *)icmp6_hdr(skb);
3177
3178         if (ipv6_addr_is_multicast(&msg->dest)) {
3179                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3180                 return;
3181         }
3182
3183         on_link = 0;
3184         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3185                 on_link = 1;
3186         } else if (ipv6_addr_type(&msg->target) !=
3187                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3188                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3189                 return;
3190         }
3191
3192         in6_dev = __in6_dev_get(skb->dev);
3193         if (!in6_dev)
3194                 return;
3195         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3196                 return;
3197
3198         /* RFC2461 8.1:
3199          *      The IP source address of the Redirect MUST be the same as the current
3200          *      first-hop router for the specified ICMP Destination Address.
3201          */
3202
3203         if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3204                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3205                 return;
3206         }
3207
3208         lladdr = NULL;
3209         if (ndopts.nd_opts_tgt_lladdr) {
3210                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3211                                              skb->dev);
3212                 if (!lladdr) {
3213                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3214                         return;
3215                 }
3216         }
3217
3218         rt = (struct rt6_info *) dst;
3219         if (rt->rt6i_flags & RTF_REJECT) {
3220                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3221                 return;
3222         }
3223
3224         /* Redirect received -> path was valid.
3225          * Look, redirects are sent only in response to data packets,
3226          * so that this nexthop apparently is reachable. --ANK
3227          */
3228         dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3229
3230         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3231         if (!neigh)
3232                 return;
3233
3234         /*
3235          *      We have finally decided to accept it.
3236          */
3237
3238         ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3239                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
3240                      NEIGH_UPDATE_F_OVERRIDE|
3241                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3242                                      NEIGH_UPDATE_F_ISROUTER)),
3243                      NDISC_REDIRECT, &ndopts);
3244
3245         nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
3246         if (!nrt)
3247                 goto out;
3248
3249         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3250         if (on_link)
3251                 nrt->rt6i_flags &= ~RTF_GATEWAY;
3252
3253         nrt->rt6i_protocol = RTPROT_REDIRECT;
3254         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3255
3256         /* No need to remove rt from the exception table if rt is
3257          * a cached route because rt6_insert_exception() will
3258          * takes care of it
3259          */
3260         if (rt6_insert_exception(nrt, rt)) {
3261                 dst_release_immediate(&nrt->dst);
3262                 goto out;
3263         }
3264
3265         netevent.old = &rt->dst;
3266         netevent.new = &nrt->dst;
3267         netevent.daddr = &msg->dest;
3268         netevent.neigh = neigh;
3269         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3270
3271 out:
3272         neigh_release(neigh);
3273 }
3274
3275 /*
3276  *      Misc support functions
3277  */
3278
3279 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
3280 {
3281         BUG_ON(from->from);
3282
3283         rt->rt6i_flags &= ~RTF_EXPIRES;
3284         dst_hold(&from->dst);
3285         rt->from = from;
3286         dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
3287 }
3288
3289 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
3290 {
3291         rt->dst.input = ort->dst.input;
3292         rt->dst.output = ort->dst.output;
3293         rt->rt6i_dst = ort->rt6i_dst;
3294         rt->dst.error = ort->dst.error;
3295         rt->rt6i_idev = ort->rt6i_idev;
3296         if (rt->rt6i_idev)
3297                 in6_dev_hold(rt->rt6i_idev);
3298         rt->dst.lastuse = jiffies;
3299         rt->rt6i_gateway = ort->rt6i_gateway;
3300         rt->rt6i_flags = ort->rt6i_flags;
3301         rt6_set_from(rt, ort);
3302         rt->rt6i_metric = ort->rt6i_metric;
3303 #ifdef CONFIG_IPV6_SUBTREES
3304         rt->rt6i_src = ort->rt6i_src;
3305 #endif
3306         rt->rt6i_prefsrc = ort->rt6i_prefsrc;
3307         rt->rt6i_table = ort->rt6i_table;
3308         rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
3309 }
3310
3311 #ifdef CONFIG_IPV6_ROUTE_INFO
3312 static struct rt6_info *rt6_get_route_info(struct net *net,
3313                                            const struct in6_addr *prefix, int prefixlen,
3314                                            const struct in6_addr *gwaddr,
3315                                            struct net_device *dev)
3316 {
3317         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3318         int ifindex = dev->ifindex;
3319         struct fib6_node *fn;
3320         struct rt6_info *rt = NULL;
3321         struct fib6_table *table;
3322
3323         table = fib6_get_table(net, tb_id);
3324         if (!table)
3325                 return NULL;
3326
3327         rcu_read_lock();
3328         fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3329         if (!fn)
3330                 goto out;
3331
3332         for_each_fib6_node_rt_rcu(fn) {
3333                 if (rt->dst.dev->ifindex != ifindex)
3334                         continue;
3335                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3336                         continue;
3337                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
3338                         continue;
3339                 ip6_hold_safe(NULL, &rt, false);
3340                 break;
3341         }
3342 out:
3343         rcu_read_unlock();
3344         return rt;
3345 }
3346
3347 static struct rt6_info *rt6_add_route_info(struct net *net,
3348                                            const struct in6_addr *prefix, int prefixlen,
3349                                            const struct in6_addr *gwaddr,
3350                                            struct net_device *dev,
3351                                            unsigned int pref)
3352 {
3353         struct fib6_config cfg = {
3354                 .fc_metric      = IP6_RT_PRIO_USER,
3355                 .fc_ifindex     = dev->ifindex,
3356                 .fc_dst_len     = prefixlen,
3357                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3358                                   RTF_UP | RTF_PREF(pref),
3359                 .fc_protocol = RTPROT_RA,
3360                 .fc_nlinfo.portid = 0,
3361                 .fc_nlinfo.nlh = NULL,
3362                 .fc_nlinfo.nl_net = net,
3363         };
3364
3365         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3366         cfg.fc_dst = *prefix;
3367         cfg.fc_gateway = *gwaddr;
3368
3369         /* We should treat it as a default route if prefix length is 0. */
3370         if (!prefixlen)
3371                 cfg.fc_flags |= RTF_DEFAULT;
3372
3373         ip6_route_add(&cfg, NULL);
3374
3375         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3376 }
3377 #endif
3378
3379 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
3380 {
3381         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3382         struct rt6_info *rt;
3383         struct fib6_table *table;
3384
3385         table = fib6_get_table(dev_net(dev), tb_id);
3386         if (!table)
3387                 return NULL;
3388
3389         rcu_read_lock();
3390         for_each_fib6_node_rt_rcu(&table->tb6_root) {
3391                 if (dev == rt->dst.dev &&
3392                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3393                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
3394                         break;
3395         }
3396         if (rt)
3397                 ip6_hold_safe(NULL, &rt, false);
3398         rcu_read_unlock();
3399         return rt;
3400 }
3401
3402 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
3403                                      struct net_device *dev,
3404                                      unsigned int pref)
3405 {
3406         struct fib6_config cfg = {
3407                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3408                 .fc_metric      = IP6_RT_PRIO_USER,
3409                 .fc_ifindex     = dev->ifindex,
3410                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3411                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3412                 .fc_protocol = RTPROT_RA,
3413                 .fc_nlinfo.portid = 0,
3414                 .fc_nlinfo.nlh = NULL,
3415                 .fc_nlinfo.nl_net = dev_net(dev),
3416         };
3417
3418         cfg.fc_gateway = *gwaddr;
3419
3420         if (!ip6_route_add(&cfg, NULL)) {
3421                 struct fib6_table *table;
3422
3423                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3424                 if (table)
3425                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3426         }
3427
3428         return rt6_get_dflt_router(gwaddr, dev);
3429 }
3430
3431 static void __rt6_purge_dflt_routers(struct fib6_table *table)
3432 {
3433         struct rt6_info *rt;
3434
3435 restart:
3436         rcu_read_lock();
3437         for_each_fib6_node_rt_rcu(&table->tb6_root) {
3438                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3439                     (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
3440                         if (dst_hold_safe(&rt->dst)) {
3441                                 rcu_read_unlock();
3442                                 ip6_del_rt(rt);
3443                         } else {
3444                                 rcu_read_unlock();
3445                         }
3446                         goto restart;
3447                 }
3448         }
3449         rcu_read_unlock();
3450
3451         table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3452 }
3453
3454 void rt6_purge_dflt_routers(struct net *net)
3455 {
3456         struct fib6_table *table;
3457         struct hlist_head *head;
3458         unsigned int h;
3459
3460         rcu_read_lock();
3461
3462         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3463                 head = &net->ipv6.fib_table_hash[h];
3464                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3465                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3466                                 __rt6_purge_dflt_routers(table);
3467                 }
3468         }
3469
3470         rcu_read_unlock();
3471 }
3472
3473 static void rtmsg_to_fib6_config(struct net *net,
3474                                  struct in6_rtmsg *rtmsg,
3475                                  struct fib6_config *cfg)
3476 {
3477         memset(cfg, 0, sizeof(*cfg));
3478
3479         cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3480                          : RT6_TABLE_MAIN;
3481         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3482         cfg->fc_metric = rtmsg->rtmsg_metric;
3483         cfg->fc_expires = rtmsg->rtmsg_info;
3484         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3485         cfg->fc_src_len = rtmsg->rtmsg_src_len;
3486         cfg->fc_flags = rtmsg->rtmsg_flags;
3487
3488         cfg->fc_nlinfo.nl_net = net;
3489
3490         cfg->fc_dst = rtmsg->rtmsg_dst;
3491         cfg->fc_src = rtmsg->rtmsg_src;
3492         cfg->fc_gateway = rtmsg->rtmsg_gateway;
3493 }
3494
3495 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3496 {
3497         struct fib6_config cfg;
3498         struct in6_rtmsg rtmsg;
3499         int err;
3500
3501         switch (cmd) {
3502         case SIOCADDRT:         /* Add a route */
3503         case SIOCDELRT:         /* Delete a route */
3504                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3505                         return -EPERM;
3506                 err = copy_from_user(&rtmsg, arg,
3507                                      sizeof(struct in6_rtmsg));
3508                 if (err)
3509                         return -EFAULT;
3510
3511                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3512
3513                 rtnl_lock();
3514                 switch (cmd) {
3515                 case SIOCADDRT:
3516                         err = ip6_route_add(&cfg, NULL);
3517                         break;
3518                 case SIOCDELRT:
3519                         err = ip6_route_del(&cfg, NULL);
3520                         break;
3521                 default:
3522                         err = -EINVAL;
3523                 }
3524                 rtnl_unlock();
3525
3526                 return err;
3527         }
3528
3529         return -EINVAL;
3530 }
3531
3532 /*
3533  *      Drop the packet on the floor
3534  */
3535
3536 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3537 {
3538         int type;
3539         struct dst_entry *dst = skb_dst(skb);
3540         switch (ipstats_mib_noroutes) {
3541         case IPSTATS_MIB_INNOROUTES:
3542                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3543                 if (type == IPV6_ADDR_ANY) {
3544                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3545                                       IPSTATS_MIB_INADDRERRORS);
3546                         break;
3547                 }
3548                 /* FALLTHROUGH */
3549         case IPSTATS_MIB_OUTNOROUTES:
3550                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3551                               ipstats_mib_noroutes);
3552                 break;
3553         }
3554         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3555         kfree_skb(skb);
3556         return 0;
3557 }
3558
3559 static int ip6_pkt_discard(struct sk_buff *skb)
3560 {
3561         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3562 }
3563
3564 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3565 {
3566         skb->dev = skb_dst(skb)->dev;
3567         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3568 }
3569
3570 static int ip6_pkt_prohibit(struct sk_buff *skb)
3571 {
3572         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3573 }
3574
3575 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3576 {
3577         skb->dev = skb_dst(skb)->dev;
3578         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3579 }
3580
3581 /*
3582  *      Allocate a dst for local (unicast / anycast) address.
3583  */
3584
3585 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
3586                                     const struct in6_addr *addr,
3587                                     bool anycast)
3588 {
3589         u32 tb_id;
3590         struct net *net = dev_net(idev->dev);
3591         struct net_device *dev = idev->dev;
3592         struct rt6_info *rt;
3593
3594         rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
3595         if (!rt)
3596                 return ERR_PTR(-ENOMEM);
3597
3598         in6_dev_hold(idev);
3599
3600         rt->dst.flags |= DST_HOST;
3601         rt->dst.input = ip6_input;
3602         rt->dst.output = ip6_output;
3603         rt->rt6i_idev = idev;
3604
3605         rt->rt6i_protocol = RTPROT_KERNEL;
3606         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
3607         if (anycast)
3608                 rt->rt6i_flags |= RTF_ANYCAST;
3609         else
3610                 rt->rt6i_flags |= RTF_LOCAL;
3611
3612         rt->rt6i_gateway  = *addr;
3613         rt->rt6i_dst.addr = *addr;
3614         rt->rt6i_dst.plen = 128;
3615         tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3616         rt->rt6i_table = fib6_get_table(net, tb_id);
3617
3618         return rt;
3619 }
3620
/*
 * Remove a deleted IP address from prefsrc entries.
 *
 * Argument bundle handed to fib6_remove_prefsrc() through
 * fib6_clean_all().
 */
struct arg_dev_net_ip {
	struct net_device *dev;	/* device to match; NULL matches any device */
	struct net *net;	/* namespace, used to skip its ip6_null_entry */
	struct in6_addr *addr;	/* address compared against rt6i_prefsrc.addr */
};
3627
3628 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
3629 {
3630         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3631         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3632         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3633
3634         if (((void *)rt->dst.dev == dev || !dev) &&
3635             rt != net->ipv6.ip6_null_entry &&
3636             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
3637                 spin_lock_bh(&rt6_exception_lock);
3638                 /* remove prefsrc entry */
3639                 rt->rt6i_prefsrc.plen = 0;
3640                 /* need to update cache as well */
3641                 rt6_exceptions_remove_prefsrc(rt);
3642                 spin_unlock_bh(&rt6_exception_lock);
3643         }
3644         return 0;
3645 }
3646
3647 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3648 {
3649         struct net *net = dev_net(ifp->idev->dev);
3650         struct arg_dev_net_ip adni = {
3651                 .dev = ifp->idev->dev,
3652                 .net = net,
3653                 .addr = &ifp->addr,
3654         };
3655         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3656 }
3657
/* Flag combination that fib6_clean_tohost() requires in full to treat a route as a router entry. */
#define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3659
3660 /* Remove routers and update dst entries when gateway turn into host. */
3661 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
3662 {
3663         struct in6_addr *gateway = (struct in6_addr *)arg;
3664
3665         if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3666             ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
3667                 return -1;
3668         }
3669
3670         /* Further clean up cached routes in exception table.
3671          * This is needed because cached route may have a different
3672          * gateway than its 'parent' in the case of an ip redirect.
3673          */
3674         rt6_exceptions_clean_tohost(rt, gateway);
3675
3676         return 0;
3677 }
3678
3679 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3680 {
3681         fib6_clean_all(net, fib6_clean_tohost, gateway);
3682 }
3683
/*
 * Argument bundle for the fib6_ifup() / fib6_ifdown() walkers.
 * Only one union member is meaningful for a given walk.
 */
struct arg_netdev_event {
	const struct net_device *dev;	/* device the event refers to */
	union {
		unsigned int nh_flags;	/* fib6_ifup(): nexthop flags to clear */
		unsigned long event;	/* fib6_ifdown(): NETDEV_* event */
	};
};
3691
3692 static struct rt6_info *rt6_multipath_first_sibling(const struct rt6_info *rt)
3693 {
3694         struct rt6_info *iter;
3695         struct fib6_node *fn;
3696
3697         fn = rcu_dereference_protected(rt->rt6i_node,
3698                         lockdep_is_held(&rt->rt6i_table->tb6_lock));
3699         iter = rcu_dereference_protected(fn->leaf,
3700                         lockdep_is_held(&rt->rt6i_table->tb6_lock));
3701         while (iter) {
3702                 if (iter->rt6i_metric == rt->rt6i_metric &&
3703                     rt6_qualify_for_ecmp(iter))
3704                         return iter;
3705                 iter = rcu_dereference_protected(iter->rt6_next,
3706                                 lockdep_is_held(&rt->rt6i_table->tb6_lock));
3707         }
3708
3709         return NULL;
3710 }
3711
3712 static bool rt6_is_dead(const struct rt6_info *rt)
3713 {
3714         if (rt->rt6i_nh_flags & RTNH_F_DEAD ||
3715             (rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
3716              rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
3717                 return true;
3718
3719         return false;
3720 }
3721
3722 static int rt6_multipath_total_weight(const struct rt6_info *rt)
3723 {
3724         struct rt6_info *iter;
3725         int total = 0;
3726
3727         if (!rt6_is_dead(rt))
3728                 total += rt->rt6i_nh_weight;
3729
3730         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) {
3731                 if (!rt6_is_dead(iter))
3732                         total += iter->rt6i_nh_weight;
3733         }
3734
3735         return total;
3736 }
3737
3738 static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total)
3739 {
3740         int upper_bound = -1;
3741
3742         if (!rt6_is_dead(rt)) {
3743                 *weight += rt->rt6i_nh_weight;
3744                 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3745                                                     total) - 1;
3746         }
3747         atomic_set(&rt->rt6i_nh_upper_bound, upper_bound);
3748 }
3749
3750 static void rt6_multipath_upper_bound_set(struct rt6_info *rt, int total)
3751 {
3752         struct rt6_info *iter;
3753         int weight = 0;
3754
3755         rt6_upper_bound_set(rt, &weight, total);
3756
3757         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3758                 rt6_upper_bound_set(iter, &weight, total);
3759 }
3760
3761 void rt6_multipath_rebalance(struct rt6_info *rt)
3762 {
3763         struct rt6_info *first;
3764         int total;
3765
3766         /* In case the entire multipath route was marked for flushing,
3767          * then there is no need to rebalance upon the removal of every
3768          * sibling route.
3769          */
3770         if (!rt->rt6i_nsiblings || rt->should_flush)
3771                 return;
3772
3773         /* During lookup routes are evaluated in order, so we need to
3774          * make sure upper bounds are assigned from the first sibling
3775          * onwards.
3776          */
3777         first = rt6_multipath_first_sibling(rt);
3778         if (WARN_ON_ONCE(!first))
3779                 return;
3780
3781         total = rt6_multipath_total_weight(first);
3782         rt6_multipath_upper_bound_set(first, total);
3783 }
3784
3785 static int fib6_ifup(struct rt6_info *rt, void *p_arg)
3786 {
3787         const struct arg_netdev_event *arg = p_arg;
3788         const struct net *net = dev_net(arg->dev);
3789
3790         if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev) {
3791                 rt->rt6i_nh_flags &= ~arg->nh_flags;
3792                 fib6_update_sernum_upto_root(dev_net(rt->dst.dev), rt);
3793                 rt6_multipath_rebalance(rt);
3794         }
3795
3796         return 0;
3797 }
3798
3799 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3800 {
3801         struct arg_netdev_event arg = {
3802                 .dev = dev,
3803                 {
3804                         .nh_flags = nh_flags,
3805                 },
3806         };
3807
3808         if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3809                 arg.nh_flags |= RTNH_F_LINKDOWN;
3810
3811         fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3812 }
3813
3814 static bool rt6_multipath_uses_dev(const struct rt6_info *rt,
3815                                    const struct net_device *dev)
3816 {
3817         struct rt6_info *iter;
3818
3819         if (rt->dst.dev == dev)
3820                 return true;
3821         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3822                 if (iter->dst.dev == dev)
3823                         return true;
3824
3825         return false;
3826 }
3827
3828 static void rt6_multipath_flush(struct rt6_info *rt)
3829 {
3830         struct rt6_info *iter;
3831
3832         rt->should_flush = 1;
3833         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3834                 iter->should_flush = 1;
3835 }
3836
3837 static unsigned int rt6_multipath_dead_count(const struct rt6_info *rt,
3838                                              const struct net_device *down_dev)
3839 {
3840         struct rt6_info *iter;
3841         unsigned int dead = 0;
3842
3843         if (rt->dst.dev == down_dev || rt->rt6i_nh_flags & RTNH_F_DEAD)
3844                 dead++;
3845         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3846                 if (iter->dst.dev == down_dev ||
3847                     iter->rt6i_nh_flags & RTNH_F_DEAD)
3848                         dead++;
3849
3850         return dead;
3851 }
3852
3853 static void rt6_multipath_nh_flags_set(struct rt6_info *rt,
3854                                        const struct net_device *dev,
3855                                        unsigned int nh_flags)
3856 {
3857         struct rt6_info *iter;
3858
3859         if (rt->dst.dev == dev)
3860                 rt->rt6i_nh_flags |= nh_flags;
3861         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3862                 if (iter->dst.dev == dev)
3863                         iter->rt6i_nh_flags |= nh_flags;
3864 }
3865
3866 /* called with write lock held for table with rt */
3867 static int fib6_ifdown(struct rt6_info *rt, void *p_arg)
3868 {
3869         const struct arg_netdev_event *arg = p_arg;
3870         const struct net_device *dev = arg->dev;
3871         const struct net *net = dev_net(dev);
3872
3873         if (rt == net->ipv6.ip6_null_entry)
3874                 return 0;
3875
3876         switch (arg->event) {
3877         case NETDEV_UNREGISTER:
3878                 return rt->dst.dev == dev ? -1 : 0;
3879         case NETDEV_DOWN:
3880                 if (rt->should_flush)
3881                         return -1;
3882                 if (!rt->rt6i_nsiblings)
3883                         return rt->dst.dev == dev ? -1 : 0;
3884                 if (rt6_multipath_uses_dev(rt, dev)) {
3885                         unsigned int count;
3886
3887                         count = rt6_multipath_dead_count(rt, dev);
3888                         if (rt->rt6i_nsiblings + 1 == count) {
3889                                 rt6_multipath_flush(rt);
3890                                 return -1;
3891                         }
3892                         rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
3893                                                    RTNH_F_LINKDOWN);
3894                         fib6_update_sernum(rt);
3895                         rt6_multipath_rebalance(rt);
3896                 }
3897                 return -2;
3898         case NETDEV_CHANGE:
3899                 if (rt->dst.dev != dev ||
3900                     rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST))
3901                         break;
3902                 rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
3903                 rt6_multipath_rebalance(rt);
3904                 break;
3905         }
3906
3907         return 0;
3908 }
3909
3910 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
3911 {
3912         struct arg_netdev_event arg = {
3913                 .dev = dev,
3914                 {
3915                         .event = event,
3916                 },
3917         };
3918
3919         fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
3920 }
3921
3922 void rt6_disable_ip(struct net_device *dev, unsigned long event)
3923 {
3924         rt6_sync_down_dev(dev, event);
3925         rt6_uncached_list_flush_dev(dev_net(dev), dev);
3926         neigh_ifdown(&nd_tbl, dev);
3927 }
3928
/* Argument bundle passed to rt6_mtu_change_route() through fib6_clean_all(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new MTU */
};
3933
3934 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
3935 {
3936         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
3937         struct inet6_dev *idev;
3938
3939         /* In IPv6 pmtu discovery is not optional,
3940            so that RTAX_MTU lock cannot disable it.
3941            We still use this lock to block changes
3942            caused by addrconf/ndisc.
3943         */
3944
3945         idev = __in6_dev_get(arg->dev);
3946         if (!idev)
3947                 return 0;
3948
3949         /* For administrative MTU increase, there is no way to discover
3950            IPv6 PMTU increase, so PMTU increase should be updated here.
3951            Since RFC 1981 doesn't include administrative MTU increase
3952            update PMTU increase is a MUST. (i.e. jumbo frame)
3953          */
3954         if (rt->dst.dev == arg->dev &&
3955             !dst_metric_locked(&rt->dst, RTAX_MTU)) {
3956                 spin_lock_bh(&rt6_exception_lock);
3957                 if (dst_metric_raw(&rt->dst, RTAX_MTU) &&
3958                     rt6_mtu_change_route_allowed(idev, rt, arg->mtu))
3959                         dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
3960                 rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
3961                 spin_unlock_bh(&rt6_exception_lock);
3962         }
3963         return 0;
3964 }
3965
3966 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
3967 {
3968         struct rt6_mtu_change_arg arg = {
3969                 .dev = dev,
3970                 .mtu = mtu,
3971         };
3972
3973         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
3974 }
3975
/*
 * Netlink attribute policy for IPv6 route messages, handed to
 * nlmsg_parse() by rtm_to_fib6_config().
 *
 * RTA_DST and RTA_SRC have no entry here; rtm_to_fib6_config() checks
 * their length against the prefix length from the rtmsg header itself.
 */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
	[RTA_PREFSRC]           = { .len = sizeof(struct in6_addr) },
	[RTA_OIF]               = { .type = NLA_U32 },
	[RTA_IIF]               = { .type = NLA_U32 },
	[RTA_PRIORITY]          = { .type = NLA_U32 },
	[RTA_METRICS]           = { .type = NLA_NESTED },
	[RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]              = { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
	[RTA_ENCAP]             = { .type = NLA_NESTED },
	[RTA_EXPIRES]           = { .type = NLA_U32 },
	[RTA_UID]               = { .type = NLA_U32 },
	[RTA_MARK]              = { .type = NLA_U32 },
	[RTA_TABLE]             = { .type = NLA_U32 },
};
3992
3993 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
3994                               struct fib6_config *cfg,
3995                               struct netlink_ext_ack *extack)
3996 {
3997         struct rtmsg *rtm;
3998         struct nlattr *tb[RTA_MAX+1];
3999         unsigned int pref;
4000         int err;
4001
4002         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4003                           NULL);
4004         if (err < 0)
4005                 goto errout;
4006
4007         err = -EINVAL;
4008         rtm = nlmsg_data(nlh);
4009         memset(cfg, 0, sizeof(*cfg));
4010
4011         cfg->fc_table = rtm->rtm_table;
4012         cfg->fc_dst_len = rtm->rtm_dst_len;
4013         cfg->fc_src_len = rtm->rtm_src_len;
4014         cfg->fc_flags = RTF_UP;
4015         cfg->fc_protocol = rtm->rtm_protocol;
4016         cfg->fc_type = rtm->rtm_type;
4017
4018         if (rtm->rtm_type == RTN_UNREACHABLE ||
4019             rtm->rtm_type == RTN_BLACKHOLE ||
4020             rtm->rtm_type == RTN_PROHIBIT ||
4021             rtm->rtm_type == RTN_THROW)
4022                 cfg->fc_flags |= RTF_REJECT;
4023
4024         if (rtm->rtm_type == RTN_LOCAL)
4025                 cfg->fc_flags |= RTF_LOCAL;
4026
4027         if (rtm->rtm_flags & RTM_F_CLONED)
4028                 cfg->fc_flags |= RTF_CACHE;
4029
4030         cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4031
4032         cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
4033         cfg->fc_nlinfo.nlh = nlh;
4034         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
4035
4036         if (tb[RTA_GATEWAY]) {
4037                 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4038                 cfg->fc_flags |= RTF_GATEWAY;
4039         }
4040
4041         if (tb[RTA_DST]) {
4042                 int plen = (rtm->rtm_dst_len + 7) >> 3;
4043
4044                 if (nla_len(tb[RTA_DST]) < plen)
4045                         goto errout;
4046
4047                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4048         }
4049
4050         if (tb[RTA_SRC]) {
4051                 int plen = (rtm->rtm_src_len + 7) >> 3;
4052
4053                 if (nla_len(tb[RTA_SRC]) < plen)
4054                         goto errout;
4055
4056                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4057         }
4058
4059         if (tb[RTA_PREFSRC])
4060                 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4061
4062         if (tb[RTA_OIF])
4063                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4064
4065         if (tb[RTA_PRIORITY])
4066                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4067
4068         if (tb[RTA_METRICS]) {
4069                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4070                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4071         }
4072
4073         if (tb[RTA_TABLE])
4074                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4075
4076         if (tb[RTA_MULTIPATH]) {
4077                 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4078                 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4079
4080                 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4081                                                      cfg->fc_mp_len, extack);
4082                 if (err < 0)
4083                         goto errout;
4084         }
4085
4086         if (tb[RTA_PREF]) {
4087                 pref = nla_get_u8(tb[RTA_PREF]);
4088                 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4089                     pref != ICMPV6_ROUTER_PREF_HIGH)
4090                         pref = ICMPV6_ROUTER_PREF_MEDIUM;
4091                 cfg->fc_flags |= RTF_PREF(pref);
4092         }
4093
4094         if (tb[RTA_ENCAP])
4095                 cfg->fc_encap = tb[RTA_ENCAP];
4096
4097         if (tb[RTA_ENCAP_TYPE]) {
4098                 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4099
4100                 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4101                 if (err < 0)
4102                         goto errout;
4103         }
4104
4105         if (tb[RTA_EXPIRES]) {
4106                 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4107
4108                 if (addrconf_finite_timeout(timeout)) {
4109                         cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4110                         cfg->fc_flags |= RTF_EXPIRES;
4111                 }
4112         }
4113
4114         err = 0;
4115 errout:
4116         return err;
4117 }
4118
/*
 * One nexthop of a multipath request while it is being assembled;
 * entries are created and queued by ip6_route_info_append().
 */
struct rt6_nh {
	struct rt6_info *rt6_info;	/* route built for this nexthop */
	struct fib6_config r_cfg;	/* copy of the per-nexthop config */
	struct mx6_config mxc;		/* filled by ip6_convert_metrics() from r_cfg */
	struct list_head next;		/* link in the caller's nexthop list */
};
4125
4126 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4127 {
4128         struct rt6_nh *nh;
4129
4130         list_for_each_entry(nh, rt6_nh_list, next) {
4131                 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4132                         &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4133                         nh->r_cfg.fc_ifindex);
4134         }
4135 }
4136
4137 static int ip6_route_info_append(struct list_head *rt6_nh_list,
4138                                  struct rt6_info *rt, struct fib6_config *r_cfg)
4139 {
4140         struct rt6_nh *nh;
4141         int err = -EEXIST;
4142
4143         list_for_each_entry(nh, rt6_nh_list, next) {
4144                 /* check if rt6_info already exists */
4145                 if (rt6_duplicate_nexthop(nh->rt6_info, rt))
4146                         return err;
4147         }
4148
4149         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4150         if (!nh)
4151                 return -ENOMEM;
4152         nh->rt6_info = rt;
4153         err = ip6_convert_metrics(&nh->mxc, r_cfg);
4154         if (err) {
4155                 kfree(nh);
4156                 return err;
4157         }
4158         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4159         list_add_tail(&nh->next, rt6_nh_list);
4160
4161         return 0;
4162 }
4163
4164 static void ip6_route_mpath_notify(struct rt6_info *rt,
4165                                    struct rt6_info *rt_last,
4166                                    struct nl_info *info,
4167                                    __u16 nlflags)
4168 {
4169         /* if this is an APPEND route, then rt points to the first route
4170          * inserted and rt_last points to last route inserted. Userspace
4171          * wants a consistent dump of the route which starts at the first
4172          * nexthop. Since sibling routes are always added at the end of
4173          * the list, find the first sibling of the last route appended
4174          */
4175         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
4176                 rt = list_first_entry(&rt_last->rt6i_siblings,
4177                                       struct rt6_info,
4178                                       rt6i_siblings);
4179         }
4180
4181         if (rt)
4182                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4183 }
4184
4185 static int ip6_route_multipath_add(struct fib6_config *cfg,
4186                                    struct netlink_ext_ack *extack)
4187 {
4188         struct rt6_info *rt_notif = NULL, *rt_last = NULL;
4189         struct nl_info *info = &cfg->fc_nlinfo;
4190         struct fib6_config r_cfg;
4191         struct rtnexthop *rtnh;
4192         struct rt6_info *rt;
4193         struct rt6_nh *err_nh;
4194         struct rt6_nh *nh, *nh_safe;
4195         __u16 nlflags;
4196         int remaining;
4197         int attrlen;
4198         int err = 1;
4199         int nhn = 0;
4200         int replace = (cfg->fc_nlinfo.nlh &&
4201                        (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4202         LIST_HEAD(rt6_nh_list);
4203
4204         nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4205         if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4206                 nlflags |= NLM_F_APPEND;
4207
4208         remaining = cfg->fc_mp_len;
4209         rtnh = (struct rtnexthop *)cfg->fc_mp;
4210
4211         /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4212          * rt6_info structs per nexthop
4213          */
4214         while (rtnh_ok(rtnh, remaining)) {
4215                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4216                 if (rtnh->rtnh_ifindex)
4217                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4218
4219                 attrlen = rtnh_attrlen(rtnh);
4220                 if (attrlen > 0) {
4221                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4222
4223                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4224                         if (nla) {
4225                                 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4226                                 r_cfg.fc_flags |= RTF_GATEWAY;
4227                         }
4228                         r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4229                         nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4230                         if (nla)
4231                                 r_cfg.fc_encap_type = nla_get_u16(nla);
4232                 }
4233
4234                 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4235                 rt = ip6_route_info_create(&r_cfg, extack);
4236                 if (IS_ERR(rt)) {
4237                         err = PTR_ERR(rt);
4238                         rt = NULL;
4239                         goto cleanup;
4240                 }
4241
4242                 rt->rt6i_nh_weight = rtnh->rtnh_hops + 1;
4243
4244                 err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
4245                 if (err) {
4246                         dst_release_immediate(&rt->dst);
4247                         goto cleanup;
4248                 }
4249
4250                 rtnh = rtnh_next(rtnh, &remaining);
4251         }
4252
4253         /* for add and replace send one notification with all nexthops.
4254          * Skip the notification in fib6_add_rt2node and send one with
4255          * the full route when done
4256          */
4257         info->skip_notify = 1;
4258
4259         err_nh = NULL;
4260         list_for_each_entry(nh, &rt6_nh_list, next) {
4261                 rt_last = nh->rt6_info;
4262                 err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
4263                 /* save reference to first route for notification */
4264                 if (!rt_notif && !err)
4265                         rt_notif = nh->rt6_info;
4266
4267                 /* nh->rt6_info is used or freed at this point, reset to NULL*/
4268                 nh->rt6_info = NULL;
4269                 if (err) {
4270                         if (replace && nhn)
4271                                 ip6_print_replace_route_err(&rt6_nh_list);
4272                         err_nh = nh;
4273                         goto add_errout;
4274                 }
4275
4276                 /* Because each route is added like a single route we remove
4277                  * these flags after the first nexthop: if there is a collision,
4278                  * we have already failed to add the first nexthop:
4279                  * fib6_add_rt2node() has rejected it; when replacing, old
4280                  * nexthops have been replaced by first new, the rest should
4281                  * be added to it.
4282                  */
4283                 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4284                                                      NLM_F_REPLACE);
4285                 nhn++;
4286         }
4287
4288         /* success ... tell user about new route */
4289         ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4290         goto cleanup;
4291
4292 add_errout:
4293         /* send notification for routes that were added so that
4294          * the delete notifications sent by ip6_route_del are
4295          * coherent
4296          */
4297         if (rt_notif)
4298                 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4299
4300         /* Delete routes that were already added */
4301         list_for_each_entry(nh, &rt6_nh_list, next) {
4302                 if (err_nh == nh)
4303                         break;
4304                 ip6_route_del(&nh->r_cfg, extack);
4305         }
4306
4307 cleanup:
4308         list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4309                 if (nh->rt6_info)
4310                         dst_release_immediate(&nh->rt6_info->dst);
4311                 kfree(nh->mxc.mx);
4312                 list_del(&nh->next);
4313                 kfree(nh);
4314         }
4315
4316         return err;
4317 }
4318
4319 static int ip6_route_multipath_del(struct fib6_config *cfg,
4320                                    struct netlink_ext_ack *extack)
4321 {
4322         struct fib6_config r_cfg;
4323         struct rtnexthop *rtnh;
4324         int remaining;
4325         int attrlen;
4326         int err = 1, last_err = 0;
4327
4328         remaining = cfg->fc_mp_len;
4329         rtnh = (struct rtnexthop *)cfg->fc_mp;
4330
4331         /* Parse a Multipath Entry */
4332         while (rtnh_ok(rtnh, remaining)) {
4333                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4334                 if (rtnh->rtnh_ifindex)
4335                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4336
4337                 attrlen = rtnh_attrlen(rtnh);
4338                 if (attrlen > 0) {
4339                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4340
4341                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4342                         if (nla) {
4343                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4344                                 r_cfg.fc_flags |= RTF_GATEWAY;
4345                         }
4346                 }
4347                 err = ip6_route_del(&r_cfg, extack);
4348                 if (err)
4349                         last_err = err;
4350
4351                 rtnh = rtnh_next(rtnh, &remaining);
4352         }
4353
4354         return last_err;
4355 }
4356
4357 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4358                               struct netlink_ext_ack *extack)
4359 {
4360         struct fib6_config cfg;
4361         int err;
4362
4363         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4364         if (err < 0)
4365                 return err;
4366
4367         if (cfg.fc_mp)
4368                 return ip6_route_multipath_del(&cfg, extack);
4369         else {
4370                 cfg.fc_delete_all_nh = 1;
4371                 return ip6_route_del(&cfg, extack);
4372         }
4373 }
4374
4375 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4376                               struct netlink_ext_ack *extack)
4377 {
4378         struct fib6_config cfg;
4379         int err;
4380
4381         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4382         if (err < 0)
4383                 return err;
4384
4385         if (cfg.fc_mp)
4386                 return ip6_route_multipath_add(&cfg, extack);
4387         else
4388                 return ip6_route_add(&cfg, extack);
4389 }
4390
4391 static size_t rt6_nlmsg_size(struct rt6_info *rt)
4392 {
4393         int nexthop_len = 0;
4394
4395         if (rt->rt6i_nsiblings) {
4396                 nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
4397                             + NLA_ALIGN(sizeof(struct rtnexthop))
4398                             + nla_total_size(16) /* RTA_GATEWAY */
4399                             + lwtunnel_get_encap_size(rt->dst.lwtstate);
4400
4401                 nexthop_len *= rt->rt6i_nsiblings;
4402         }
4403
4404         return NLMSG_ALIGN(sizeof(struct rtmsg))
4405                + nla_total_size(16) /* RTA_SRC */
4406                + nla_total_size(16) /* RTA_DST */
4407                + nla_total_size(16) /* RTA_GATEWAY */
4408                + nla_total_size(16) /* RTA_PREFSRC */
4409                + nla_total_size(4) /* RTA_TABLE */
4410                + nla_total_size(4) /* RTA_IIF */
4411                + nla_total_size(4) /* RTA_OIF */
4412                + nla_total_size(4) /* RTA_PRIORITY */
4413                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4414                + nla_total_size(sizeof(struct rta_cacheinfo))
4415                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4416                + nla_total_size(1) /* RTA_PREF */
4417                + lwtunnel_get_encap_size(rt->dst.lwtstate)
4418                + nexthop_len;
4419 }
4420
4421 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
4422                             unsigned int *flags, bool skip_oif)
4423 {
4424         if (rt->rt6i_nh_flags & RTNH_F_DEAD)
4425                 *flags |= RTNH_F_DEAD;
4426
4427         if (rt->rt6i_nh_flags & RTNH_F_LINKDOWN) {
4428                 *flags |= RTNH_F_LINKDOWN;
4429                 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
4430                         *flags |= RTNH_F_DEAD;
4431         }
4432
4433         if (rt->rt6i_flags & RTF_GATEWAY) {
4434                 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
4435                         goto nla_put_failure;
4436         }
4437
4438         *flags |= (rt->rt6i_nh_flags & RTNH_F_ONLINK);
4439         if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
4440                 *flags |= RTNH_F_OFFLOAD;
4441
4442         /* not needed for multipath encoding b/c it has a rtnexthop struct */
4443         if (!skip_oif && rt->dst.dev &&
4444             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
4445                 goto nla_put_failure;
4446
4447         if (rt->dst.lwtstate &&
4448             lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
4449                 goto nla_put_failure;
4450
4451         return 0;
4452
4453 nla_put_failure:
4454         return -EMSGSIZE;
4455 }
4456
4457 /* add multipath next hop */
4458 static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
4459 {
4460         struct rtnexthop *rtnh;
4461         unsigned int flags = 0;
4462
4463         rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4464         if (!rtnh)
4465                 goto nla_put_failure;
4466
4467         rtnh->rtnh_hops = rt->rt6i_nh_weight - 1;
4468         rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
4469
4470         if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4471                 goto nla_put_failure;
4472
4473         rtnh->rtnh_flags = flags;
4474
4475         /* length of rtnetlink header + attributes */
4476         rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4477
4478         return 0;
4479
4480 nla_put_failure:
4481         return -EMSGSIZE;
4482 }
4483
4484 static int rt6_fill_node(struct net *net,
4485                          struct sk_buff *skb, struct rt6_info *rt,
4486                          struct in6_addr *dst, struct in6_addr *src,
4487                          int iif, int type, u32 portid, u32 seq,
4488                          unsigned int flags)
4489 {
4490         u32 metrics[RTAX_MAX];
4491         struct rtmsg *rtm;
4492         struct nlmsghdr *nlh;
4493         long expires;
4494         u32 table;
4495
4496         nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4497         if (!nlh)
4498                 return -EMSGSIZE;
4499
4500         rtm = nlmsg_data(nlh);
4501         rtm->rtm_family = AF_INET6;
4502         rtm->rtm_dst_len = rt->rt6i_dst.plen;
4503         rtm->rtm_src_len = rt->rt6i_src.plen;
4504         rtm->rtm_tos = 0;
4505         if (rt->rt6i_table)
4506                 table = rt->rt6i_table->tb6_id;
4507         else
4508                 table = RT6_TABLE_UNSPEC;
4509         rtm->rtm_table = table;
4510         if (nla_put_u32(skb, RTA_TABLE, table))
4511                 goto nla_put_failure;
4512         if (rt->rt6i_flags & RTF_REJECT) {
4513                 switch (rt->dst.error) {
4514                 case -EINVAL:
4515                         rtm->rtm_type = RTN_BLACKHOLE;
4516                         break;
4517                 case -EACCES:
4518                         rtm->rtm_type = RTN_PROHIBIT;
4519                         break;
4520                 case -EAGAIN:
4521                         rtm->rtm_type = RTN_THROW;
4522                         break;
4523                 default:
4524                         rtm->rtm_type = RTN_UNREACHABLE;
4525                         break;
4526                 }
4527         }
4528         else if (rt->rt6i_flags & RTF_LOCAL)
4529                 rtm->rtm_type = RTN_LOCAL;
4530         else if (rt->rt6i_flags & RTF_ANYCAST)
4531                 rtm->rtm_type = RTN_ANYCAST;
4532         else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
4533                 rtm->rtm_type = RTN_LOCAL;
4534         else
4535                 rtm->rtm_type = RTN_UNICAST;
4536         rtm->rtm_flags = 0;
4537         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4538         rtm->rtm_protocol = rt->rt6i_protocol;
4539
4540         if (rt->rt6i_flags & RTF_CACHE)
4541                 rtm->rtm_flags |= RTM_F_CLONED;
4542
4543         if (dst) {
4544                 if (nla_put_in6_addr(skb, RTA_DST, dst))
4545                         goto nla_put_failure;
4546                 rtm->rtm_dst_len = 128;
4547         } else if (rtm->rtm_dst_len)
4548                 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
4549                         goto nla_put_failure;
4550 #ifdef CONFIG_IPV6_SUBTREES
4551         if (src) {
4552                 if (nla_put_in6_addr(skb, RTA_SRC, src))
4553                         goto nla_put_failure;
4554                 rtm->rtm_src_len = 128;
4555         } else if (rtm->rtm_src_len &&
4556                    nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
4557                 goto nla_put_failure;
4558 #endif
4559         if (iif) {
4560 #ifdef CONFIG_IPV6_MROUTE
4561                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
4562                         int err = ip6mr_get_route(net, skb, rtm, portid);
4563
4564                         if (err == 0)
4565                                 return 0;
4566                         if (err < 0)
4567                                 goto nla_put_failure;
4568                 } else
4569 #endif
4570                         if (nla_put_u32(skb, RTA_IIF, iif))
4571                                 goto nla_put_failure;
4572         } else if (dst) {
4573                 struct in6_addr saddr_buf;
4574                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
4575                     nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4576                         goto nla_put_failure;
4577         }
4578
4579         if (rt->rt6i_prefsrc.plen) {
4580                 struct in6_addr saddr_buf;
4581                 saddr_buf = rt->rt6i_prefsrc.addr;
4582                 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4583                         goto nla_put_failure;
4584         }
4585
4586         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
4587         if (rt->rt6i_pmtu)
4588                 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
4589         if (rtnetlink_put_metrics(skb, metrics) < 0)
4590                 goto nla_put_failure;
4591
4592         if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
4593                 goto nla_put_failure;
4594
4595         /* For multipath routes, walk the siblings list and add
4596          * each as a nexthop within RTA_MULTIPATH.
4597          */
4598         if (rt->rt6i_nsiblings) {
4599                 struct rt6_info *sibling, *next_sibling;
4600                 struct nlattr *mp;
4601
4602                 mp = nla_nest_start(skb, RTA_MULTIPATH);
4603                 if (!mp)
4604                         goto nla_put_failure;
4605
4606                 if (rt6_add_nexthop(skb, rt) < 0)
4607                         goto nla_put_failure;
4608
4609                 list_for_each_entry_safe(sibling, next_sibling,
4610                                          &rt->rt6i_siblings, rt6i_siblings) {
4611                         if (rt6_add_nexthop(skb, sibling) < 0)
4612                                 goto nla_put_failure;
4613                 }
4614
4615                 nla_nest_end(skb, mp);
4616         } else {
4617                 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4618                         goto nla_put_failure;
4619         }
4620
4621         expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
4622
4623         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
4624                 goto nla_put_failure;
4625
4626         if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
4627                 goto nla_put_failure;
4628
4629
4630         nlmsg_end(skb, nlh);
4631         return 0;
4632
4633 nla_put_failure:
4634         nlmsg_cancel(skb, nlh);
4635         return -EMSGSIZE;
4636 }
4637
4638 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
4639 {
4640         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4641         struct net *net = arg->net;
4642
4643         if (rt == net->ipv6.ip6_null_entry)
4644                 return 0;
4645
4646         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4647                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4648
4649                 /* user wants prefix routes only */
4650                 if (rtm->rtm_flags & RTM_F_PREFIX &&
4651                     !(rt->rt6i_flags & RTF_PREFIX_RT)) {
4652                         /* success since this is not a prefix route */
4653                         return 1;
4654                 }
4655         }
4656
4657         return rt6_fill_node(net,
4658                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
4659                      NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
4660                      NLM_F_MULTI);
4661 }
4662
4663 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4664                               struct netlink_ext_ack *extack)
4665 {
4666         struct net *net = sock_net(in_skb->sk);
4667         struct nlattr *tb[RTA_MAX+1];
4668         int err, iif = 0, oif = 0;
4669         struct dst_entry *dst;
4670         struct rt6_info *rt;
4671         struct sk_buff *skb;
4672         struct rtmsg *rtm;
4673         struct flowi6 fl6;
4674         bool fibmatch;
4675
4676         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4677                           extack);
4678         if (err < 0)
4679                 goto errout;
4680
4681         err = -EINVAL;
4682         memset(&fl6, 0, sizeof(fl6));
4683         rtm = nlmsg_data(nlh);
4684         fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4685         fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4686
4687         if (tb[RTA_SRC]) {
4688                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4689                         goto errout;
4690
4691                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4692         }
4693
4694         if (tb[RTA_DST]) {
4695                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4696                         goto errout;
4697
4698                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4699         }
4700
4701         if (tb[RTA_IIF])
4702                 iif = nla_get_u32(tb[RTA_IIF]);
4703
4704         if (tb[RTA_OIF])
4705                 oif = nla_get_u32(tb[RTA_OIF]);
4706
4707         if (tb[RTA_MARK])
4708                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4709
4710         if (tb[RTA_UID])
4711                 fl6.flowi6_uid = make_kuid(current_user_ns(),
4712                                            nla_get_u32(tb[RTA_UID]));
4713         else
4714                 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4715
4716         if (iif) {
4717                 struct net_device *dev;
4718                 int flags = 0;
4719
4720                 rcu_read_lock();
4721
4722                 dev = dev_get_by_index_rcu(net, iif);
4723                 if (!dev) {
4724                         rcu_read_unlock();
4725                         err = -ENODEV;
4726                         goto errout;
4727                 }
4728
4729                 fl6.flowi6_iif = iif;
4730
4731                 if (!ipv6_addr_any(&fl6.saddr))
4732                         flags |= RT6_LOOKUP_F_HAS_SADDR;
4733
4734                 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
4735
4736                 rcu_read_unlock();
4737         } else {
4738                 fl6.flowi6_oif = oif;
4739
4740                 dst = ip6_route_output(net, NULL, &fl6);
4741         }
4742
4743
4744         rt = container_of(dst, struct rt6_info, dst);
4745         if (rt->dst.error) {
4746                 err = rt->dst.error;
4747                 ip6_rt_put(rt);
4748                 goto errout;
4749         }
4750
4751         if (rt == net->ipv6.ip6_null_entry) {
4752                 err = rt->dst.error;
4753                 ip6_rt_put(rt);
4754                 goto errout;
4755         }
4756
4757         if (fibmatch && rt->from) {
4758                 struct rt6_info *ort = rt->from;
4759
4760                 dst_hold(&ort->dst);
4761                 ip6_rt_put(rt);
4762                 rt = ort;
4763         }
4764
4765         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4766         if (!skb) {
4767                 ip6_rt_put(rt);
4768                 err = -ENOBUFS;
4769                 goto errout;
4770         }
4771
4772         skb_dst_set(skb, &rt->dst);
4773         if (fibmatch)
4774                 err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
4775                                     RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4776                                     nlh->nlmsg_seq, 0);
4777         else
4778                 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
4779                                     RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4780                                     nlh->nlmsg_seq, 0);
4781         if (err < 0) {
4782                 kfree_skb(skb);
4783                 goto errout;
4784         }
4785
4786         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4787 errout:
4788         return err;
4789 }
4790
4791 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
4792                      unsigned int nlm_flags)
4793 {
4794         struct sk_buff *skb;
4795         struct net *net = info->nl_net;
4796         u32 seq;
4797         int err;
4798
4799         err = -ENOBUFS;
4800         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4801
4802         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4803         if (!skb)
4804                 goto errout;
4805
4806         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
4807                                 event, info->portid, seq, nlm_flags);
4808         if (err < 0) {
4809                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4810                 WARN_ON(err == -EMSGSIZE);
4811                 kfree_skb(skb);
4812                 goto errout;
4813         }
4814         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4815                     info->nlh, gfp_any());
4816         return;
4817 errout:
4818         if (err < 0)
4819                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4820 }
4821
4822 static int ip6_route_dev_notify(struct notifier_block *this,
4823                                 unsigned long event, void *ptr)
4824 {
4825         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4826         struct net *net = dev_net(dev);
4827
4828         if (!(dev->flags & IFF_LOOPBACK))
4829                 return NOTIFY_OK;
4830
4831         if (event == NETDEV_REGISTER) {
4832                 net->ipv6.ip6_null_entry->dst.dev = dev;
4833                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
4834 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4835                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
4836                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
4837                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
4838                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
4839 #endif
4840          } else if (event == NETDEV_UNREGISTER &&
4841                     dev->reg_state != NETREG_UNREGISTERED) {
4842                 /* NETDEV_UNREGISTER could be fired for multiple times by
4843                  * netdev_wait_allrefs(). Make sure we only call this once.
4844                  */
4845                 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
4846 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4847                 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
4848                 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
4849 #endif
4850         }
4851
4852         return NOTIFY_OK;
4853 }
4854
4855 /*
4856  *      /proc
4857  */
4858
4859 #ifdef CONFIG_PROC_FS
4860
4861 static const struct file_operations ipv6_route_proc_fops = {
4862         .open           = ipv6_route_open,
4863         .read           = seq_read,
4864         .llseek         = seq_lseek,
4865         .release        = seq_release_net,
4866 };
4867
4868 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
4869 {
4870         struct net *net = (struct net *)seq->private;
4871         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
4872                    net->ipv6.rt6_stats->fib_nodes,
4873                    net->ipv6.rt6_stats->fib_route_nodes,
4874                    atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
4875                    net->ipv6.rt6_stats->fib_rt_entries,
4876                    net->ipv6.rt6_stats->fib_rt_cache,
4877                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
4878                    net->ipv6.rt6_stats->fib_discarded_routes);
4879
4880         return 0;
4881 }
4882
4883 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
4884 {
4885         return single_open_net(inode, file, rt6_stats_seq_show);
4886 }
4887
4888 static const struct file_operations rt6_stats_seq_fops = {
4889         .open    = rt6_stats_seq_open,
4890         .read    = seq_read,
4891         .llseek  = seq_lseek,
4892         .release = single_release_net,
4893 };
4894 #endif  /* CONFIG_PROC_FS */
4895
4896 #ifdef CONFIG_SYSCTL
4897
4898 static
4899 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4900                               void __user *buffer, size_t *lenp, loff_t *ppos)
4901 {
4902         struct net *net;
4903         int delay;
4904         if (!write)
4905                 return -EINVAL;
4906
4907         net = (struct net *)ctl->extra1;
4908         delay = net->ipv6.sysctl.flush_delay;
4909         proc_dointvec(ctl, write, buffer, lenp, ppos);
4910         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
4911         return 0;
4912 }
4913
4914 struct ctl_table ipv6_route_table_template[] = {
4915         {
4916                 .procname       =       "flush",
4917                 .data           =       &init_net.ipv6.sysctl.flush_delay,
4918                 .maxlen         =       sizeof(int),
4919                 .mode           =       0200,
4920                 .proc_handler   =       ipv6_sysctl_rtcache_flush
4921         },
4922         {
4923                 .procname       =       "gc_thresh",
4924                 .data           =       &ip6_dst_ops_template.gc_thresh,
4925                 .maxlen         =       sizeof(int),
4926                 .mode           =       0644,
4927                 .proc_handler   =       proc_dointvec,
4928         },
4929         {
4930                 .procname       =       "max_size",
4931                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
4932                 .maxlen         =       sizeof(int),
4933                 .mode           =       0644,
4934                 .proc_handler   =       proc_dointvec,
4935         },
4936         {
4937                 .procname       =       "gc_min_interval",
4938                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4939                 .maxlen         =       sizeof(int),
4940                 .mode           =       0644,
4941                 .proc_handler   =       proc_dointvec_jiffies,
4942         },
4943         {
4944                 .procname       =       "gc_timeout",
4945                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
4946                 .maxlen         =       sizeof(int),
4947                 .mode           =       0644,
4948                 .proc_handler   =       proc_dointvec_jiffies,
4949         },
4950         {
4951                 .procname       =       "gc_interval",
4952                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
4953                 .maxlen         =       sizeof(int),
4954                 .mode           =       0644,
4955                 .proc_handler   =       proc_dointvec_jiffies,
4956         },
4957         {
4958                 .procname       =       "gc_elasticity",
4959                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
4960                 .maxlen         =       sizeof(int),
4961                 .mode           =       0644,
4962                 .proc_handler   =       proc_dointvec,
4963         },
4964         {
4965                 .procname       =       "mtu_expires",
4966                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
4967                 .maxlen         =       sizeof(int),
4968                 .mode           =       0644,
4969                 .proc_handler   =       proc_dointvec_jiffies,
4970         },
4971         {
4972                 .procname       =       "min_adv_mss",
4973                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
4974                 .maxlen         =       sizeof(int),
4975                 .mode           =       0644,
4976                 .proc_handler   =       proc_dointvec,
4977         },
4978         {
4979                 .procname       =       "gc_min_interval_ms",
4980                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4981                 .maxlen         =       sizeof(int),
4982                 .mode           =       0644,
4983                 .proc_handler   =       proc_dointvec_ms_jiffies,
4984         },
4985         { }
4986 };
4987
4988 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
4989 {
4990         struct ctl_table *table;
4991
4992         table = kmemdup(ipv6_route_table_template,
4993                         sizeof(ipv6_route_table_template),
4994                         GFP_KERNEL);
4995
4996         if (table) {
4997                 table[0].data = &net->ipv6.sysctl.flush_delay;
4998                 table[0].extra1 = net;
4999                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
5000                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
5001                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5002                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
5003                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
5004                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
5005                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5006                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5007                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5008
5009                 /* Don't export sysctls to unprivileged users */
5010                 if (net->user_ns != &init_user_ns)
5011                         table[0].procname = NULL;
5012         }
5013
5014         return table;
5015 }
5016 #endif
5017
5018 static int __net_init ip6_route_net_init(struct net *net)
5019 {
5020         int ret = -ENOMEM;
5021
5022         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
5023                sizeof(net->ipv6.ip6_dst_ops));
5024
5025         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
5026                 goto out_ip6_dst_ops;
5027
5028         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
5029                                            sizeof(*net->ipv6.ip6_null_entry),
5030                                            GFP_KERNEL);
5031         if (!net->ipv6.ip6_null_entry)
5032                 goto out_ip6_dst_entries;
5033         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5034         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
5035                          ip6_template_metrics, true);
5036
5037 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5038         net->ipv6.fib6_has_custom_rules = false;
5039         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
5040                                                sizeof(*net->ipv6.ip6_prohibit_entry),
5041                                                GFP_KERNEL);
5042         if (!net->ipv6.ip6_prohibit_entry)
5043                 goto out_ip6_null_entry;
5044         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5045         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
5046                          ip6_template_metrics, true);
5047
5048         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
5049                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
5050                                                GFP_KERNEL);
5051         if (!net->ipv6.ip6_blk_hole_entry)
5052                 goto out_ip6_prohibit_entry;
5053         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5054         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
5055                          ip6_template_metrics, true);
5056 #endif
5057
5058         net->ipv6.sysctl.flush_delay = 0;
5059         net->ipv6.sysctl.ip6_rt_max_size = 4096;
5060         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
5061         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
5062         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
5063         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
5064         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
5065         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
5066
5067         net->ipv6.ip6_rt_gc_expire = 30*HZ;
5068
5069         ret = 0;
5070 out:
5071         return ret;
5072
5073 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5074 out_ip6_prohibit_entry:
5075         kfree(net->ipv6.ip6_prohibit_entry);
5076 out_ip6_null_entry:
5077         kfree(net->ipv6.ip6_null_entry);
5078 #endif
5079 out_ip6_dst_entries:
5080         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5081 out_ip6_dst_ops:
5082         goto out;
5083 }
5084
5085 static void __net_exit ip6_route_net_exit(struct net *net)
5086 {
5087         kfree(net->ipv6.ip6_null_entry);
5088 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5089         kfree(net->ipv6.ip6_prohibit_entry);
5090         kfree(net->ipv6.ip6_blk_hole_entry);
5091 #endif
5092         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5093 }
5094
5095 static int __net_init ip6_route_net_init_late(struct net *net)
5096 {
5097 #ifdef CONFIG_PROC_FS
5098         proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
5099         proc_create("rt6_stats", 0444, net->proc_net, &rt6_stats_seq_fops);
5100 #endif
5101         return 0;
5102 }
5103
5104 static void __net_exit ip6_route_net_exit_late(struct net *net)
5105 {
5106 #ifdef CONFIG_PROC_FS
5107         remove_proc_entry("ipv6_route", net->proc_net);
5108         remove_proc_entry("rt6_stats", net->proc_net);
5109 #endif
5110 }
5111
5112 static struct pernet_operations ip6_route_net_ops = {
5113         .init = ip6_route_net_init,
5114         .exit = ip6_route_net_exit,
5115 };
5116
5117 static int __net_init ipv6_inetpeer_init(struct net *net)
5118 {
5119         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5120
5121         if (!bp)
5122                 return -ENOMEM;
5123         inet_peer_base_init(bp);
5124         net->ipv6.peers = bp;
5125         return 0;
5126 }
5127
5128 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5129 {
5130         struct inet_peer_base *bp = net->ipv6.peers;
5131
5132         net->ipv6.peers = NULL;
5133         inetpeer_invalidate_tree(bp);
5134         kfree(bp);
5135 }
5136
5137 static struct pernet_operations ipv6_inetpeer_ops = {
5138         .init   =       ipv6_inetpeer_init,
5139         .exit   =       ipv6_inetpeer_exit,
5140 };
5141
5142 static struct pernet_operations ip6_route_net_late_ops = {
5143         .init = ip6_route_net_init_late,
5144         .exit = ip6_route_net_exit_late,
5145 };
5146
5147 static struct notifier_block ip6_route_dev_notifier = {
5148         .notifier_call = ip6_route_dev_notify,
5149         .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
5150 };
5151
5152 void __init ip6_route_init_special_entries(void)
5153 {
5154         /* Registering of the loopback is done before this portion of code,
5155          * the loopback reference in rt6_info will not be taken, do it
5156          * manually for init_net */
5157         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5158         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5159   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5160         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5161         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5162         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5163         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5164   #endif
5165 }
5166
5167 int __init ip6_route_init(void)
5168 {
5169         int ret;
5170         int cpu;
5171
5172         ret = -ENOMEM;
5173         ip6_dst_ops_template.kmem_cachep =
5174                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
5175                                   SLAB_HWCACHE_ALIGN, NULL);
5176         if (!ip6_dst_ops_template.kmem_cachep)
5177                 goto out;
5178
5179         ret = dst_entries_init(&ip6_dst_blackhole_ops);
5180         if (ret)
5181                 goto out_kmem_cache;
5182
5183         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5184         if (ret)
5185                 goto out_dst_entries;
5186
5187         ret = register_pernet_subsys(&ip6_route_net_ops);
5188         if (ret)
5189                 goto out_register_inetpeer;
5190
5191         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5192
5193         ret = fib6_init();
5194         if (ret)
5195                 goto out_register_subsys;
5196
5197         ret = xfrm6_init();
5198         if (ret)
5199                 goto out_fib6_init;
5200
5201         ret = fib6_rules_init();
5202         if (ret)
5203                 goto xfrm6_init;
5204
5205         ret = register_pernet_subsys(&ip6_route_net_late_ops);
5206         if (ret)
5207                 goto fib6_rules_init;
5208
5209         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5210                                    inet6_rtm_newroute, NULL, 0);
5211         if (ret < 0)
5212                 goto out_register_late_subsys;
5213
5214         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5215                                    inet6_rtm_delroute, NULL, 0);
5216         if (ret < 0)
5217                 goto out_register_late_subsys;
5218
5219         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5220                                    inet6_rtm_getroute, NULL,
5221                                    RTNL_FLAG_DOIT_UNLOCKED);
5222         if (ret < 0)
5223                 goto out_register_late_subsys;
5224
5225         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5226         if (ret)
5227                 goto out_register_late_subsys;
5228
5229         for_each_possible_cpu(cpu) {
5230                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5231
5232                 INIT_LIST_HEAD(&ul->head);
5233                 spin_lock_init(&ul->lock);
5234         }
5235
5236 out:
5237         return ret;
5238
5239 out_register_late_subsys:
5240         rtnl_unregister_all(PF_INET6);
5241         unregister_pernet_subsys(&ip6_route_net_late_ops);
5242 fib6_rules_init:
5243         fib6_rules_cleanup();
5244 xfrm6_init:
5245         xfrm6_fini();
5246 out_fib6_init:
5247         fib6_gc_cleanup();
5248 out_register_subsys:
5249         unregister_pernet_subsys(&ip6_route_net_ops);
5250 out_register_inetpeer:
5251         unregister_pernet_subsys(&ipv6_inetpeer_ops);
5252 out_dst_entries:
5253         dst_entries_destroy(&ip6_dst_blackhole_ops);
5254 out_kmem_cache:
5255         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5256         goto out;
5257 }
5258
5259 void ip6_route_cleanup(void)
5260 {
5261         unregister_netdevice_notifier(&ip6_route_dev_notifier);
5262         unregister_pernet_subsys(&ip6_route_net_late_ops);
5263         fib6_rules_cleanup();
5264         xfrm6_fini();
5265         fib6_gc_cleanup();
5266         unregister_pernet_subsys(&ipv6_inetpeer_ops);
5267         unregister_pernet_subsys(&ip6_route_net_ops);
5268         dst_entries_destroy(&ip6_dst_blackhole_ops);
5269         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5270 }