]> asedeno.scripts.mit.edu Git - linux.git/blob - net/ipv6/route.c
ipv6: Refactor fib6_ignore_linkdown
[linux.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <net/ip.h>
67 #include <linux/uaccess.h>
68
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72
73 static int ip6_rt_type_to_error(u8 fib6_type);
74
75 #define CREATE_TRACE_POINTS
76 #include <trace/events/fib6.h>
77 EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
78 #undef CREATE_TRACE_POINTS
79
/* Neighbour Unreachability Detection verdicts produced by
 * rt6_check_neigh()/rt6_score_route() when scoring candidate routes.
 * Negative values reject the route; callers also compare against the
 * specific constants (see find_match()).
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,		/* route unusable, skip outright */
	RT6_NUD_FAIL_PROBE = -2,	/* neighbour NUD_FAILED (router-pref only) */
	RT6_NUD_FAIL_DO_RR = -1,	/* no neighbour entry: request round-robin */
	RT6_NUD_SUCCEED = 1		/* neighbour (probably) reachable */
};
86
87 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
88 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
89 static unsigned int      ip6_mtu(const struct dst_entry *dst);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void             ip6_dst_destroy(struct dst_entry *);
92 static void             ip6_dst_ifdown(struct dst_entry *,
93                                        struct net_device *dev, int how);
94 static int               ip6_dst_gc(struct dst_ops *ops);
95
96 static int              ip6_pkt_discard(struct sk_buff *skb);
97 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
98 static int              ip6_pkt_prohibit(struct sk_buff *skb);
99 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
100 static void             ip6_link_failure(struct sk_buff *skb);
101 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
102                                            struct sk_buff *skb, u32 mtu);
103 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
104                                         struct sk_buff *skb);
105 static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
106 static size_t rt6_nlmsg_size(struct fib6_info *rt);
107 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
108                          struct fib6_info *rt, struct dst_entry *dst,
109                          struct in6_addr *dest, struct in6_addr *src,
110                          int iif, int type, u32 portid, u32 seq,
111                          unsigned int flags);
112 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
113                                            struct in6_addr *daddr,
114                                            struct in6_addr *saddr);
115
116 #ifdef CONFIG_IPV6_ROUTE_INFO
117 static struct fib6_info *rt6_add_route_info(struct net *net,
118                                            const struct in6_addr *prefix, int prefixlen,
119                                            const struct in6_addr *gwaddr,
120                                            struct net_device *dev,
121                                            unsigned int pref);
122 static struct fib6_info *rt6_get_route_info(struct net *net,
123                                            const struct in6_addr *prefix, int prefixlen,
124                                            const struct in6_addr *gwaddr,
125                                            struct net_device *dev);
126 #endif
127
/* Per-cpu list of rt6_info dsts that live outside the FIB tree; walked
 * by rt6_uncached_list_flush_dev() when a device goes away.
 * @lock protects @head.
 */
struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};
132
133 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
134
135 void rt6_uncached_list_add(struct rt6_info *rt)
136 {
137         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
138
139         rt->rt6i_uncached_list = ul;
140
141         spin_lock_bh(&ul->lock);
142         list_add_tail(&rt->rt6i_uncached, &ul->head);
143         spin_unlock_bh(&ul->lock);
144 }
145
146 void rt6_uncached_list_del(struct rt6_info *rt)
147 {
148         if (!list_empty(&rt->rt6i_uncached)) {
149                 struct uncached_list *ul = rt->rt6i_uncached_list;
150                 struct net *net = dev_net(rt->dst.dev);
151
152                 spin_lock_bh(&ul->lock);
153                 list_del(&rt->rt6i_uncached);
154                 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
155                 spin_unlock_bh(&ul->lock);
156         }
157 }
158
/* Device @dev is going away: walk every CPU's uncached list and
 * repoint any rt6_info still referencing @dev at the netns loopback
 * device, so the dst stays valid until its last reference is dropped.
 * No-op when @dev is the loopback device itself.
 */
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			/* Migrate the inet6_dev reference to loopback. */
			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			/* Migrate the dst's device reference to loopback. */
			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
190
191 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
192                                              struct sk_buff *skb,
193                                              const void *daddr)
194 {
195         if (!ipv6_addr_any(p))
196                 return (const void *) p;
197         else if (skb)
198                 return &ipv6_hdr(skb)->daddr;
199         return daddr;
200 }
201
202 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
203                                    struct net_device *dev,
204                                    struct sk_buff *skb,
205                                    const void *daddr)
206 {
207         struct neighbour *n;
208
209         daddr = choose_neigh_daddr(gw, skb, daddr);
210         n = __ipv6_neigh_lookup(dev, daddr);
211         if (n)
212                 return n;
213
214         n = neigh_create(&nd_tbl, daddr, dev);
215         return IS_ERR(n) ? NULL : n;
216 }
217
218 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
219                                               struct sk_buff *skb,
220                                               const void *daddr)
221 {
222         const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
223
224         return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
225 }
226
227 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
228 {
229         struct net_device *dev = dst->dev;
230         struct rt6_info *rt = (struct rt6_info *)dst;
231
232         daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
233         if (!daddr)
234                 return;
235         if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
236                 return;
237         if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
238                 return;
239         __ipv6_confirm_neigh(dev, daddr);
240 }
241
/* Template for the regular IPv6 dst_ops; wires the dst core to the
 * ip6_* handlers defined in this file.
 */
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	dst_cow_metrics_generic,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};
260
261 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
262 {
263         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
264
265         return mtu ? : dst->dev->mtu;
266 }
267
/* Blackholed dsts deliberately ignore PMTU updates. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}
272
/* Blackholed dsts deliberately ignore redirects. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}
277
/* dst_ops for blackholed dsts: PMTU updates and redirects are no-ops,
 * MTU falls back to the device via ip6_blackhole_mtu().
 */
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
};
289
/* Metrics used by the sentinel route templates below (hop limit
 * explicitly zeroed).
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};
293
/* Template for the fib6 "no route" sentinel: a kernel-installed
 * RTN_UNREACHABLE reject entry with the worst possible metric.
 */
static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol  = RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};
302
/* rt6_info sentinel for unreachable destinations: discards traffic
 * and reports -ENETUNREACH.
 */
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
314
315 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
316
/* rt6_info sentinel for administratively prohibited destinations:
 * discards traffic and reports -EACCES.
 */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
328
/* rt6_info sentinel for blackholed destinations: silently discards
 * traffic (dst_discard) with error -EINVAL.
 */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
340
341 #endif
342
/* Zero everything in @rt that follows the embedded dst_entry (which
 * dst_alloc() already initialized) and reset the uncached-list link.
 */
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	/* dst + 1 is the first byte of rt6_info past the dst member. */
	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}
350
351 /* allocate dst with ip6_dst_ops */
352 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
353                                int flags)
354 {
355         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
356                                         1, DST_OBSOLETE_FORCE_CHK, flags);
357
358         if (rt) {
359                 rt6_info_init(rt);
360                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
361         }
362
363         return rt;
364 }
365 EXPORT_SYMBOL(ip6_dst_alloc);
366
/* dst_ops->destroy hook: release every reference this rt6_info holds --
 * metrics, uncached-list membership, its inet6_dev and the fib6_info
 * it was cloned from.
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	ip_dst_metrics_put(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	/* Detach from the originating fib6_info under RCU, then drop the
	 * reference taken when rt->from was set.
	 */
	rcu_read_lock();
	from = rcu_dereference(rt->from);
	rcu_assign_pointer(rt->from, NULL);
	fib6_info_release(from);
	rcu_read_unlock();
}
388
/* dst_ops->ifdown hook: @dev is going down, so migrate this dst's
 * inet6_dev reference to the netns loopback device, letting the stale
 * device be released.
 */
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}
405
406 static bool __rt6_check_expired(const struct rt6_info *rt)
407 {
408         if (rt->rt6i_flags & RTF_EXPIRES)
409                 return time_after(jiffies, rt->dst.expires);
410         else
411                 return false;
412 }
413
/* Full expiry test: honor the rt's own RTF_EXPIRES timer first;
 * otherwise a cached clone also counts as expired when its dst was
 * invalidated or its parent fib6_info expired.  The rt->from
 * dereference requires the caller to hold rcu_read_lock().
 */
static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}
429
/* Select one nexthop among @match and its ECMP siblings by comparing
 * the flow hash (fl6->mp_hash) against each nexthop's upper bound.
 * Falls back to @match when no sibling owns the hash bucket or the
 * owning sibling scores as unusable.
 */
struct fib6_info *fib6_multipath_select(const struct net *net,
					struct fib6_info *match,
					struct flowi6 *fl6, int oif,
					const struct sk_buff *skb,
					int strict)
{
	struct fib6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		/* First sibling owning the bucket; keep @match instead if
		 * that sibling does not score as usable.
		 */
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}
462
463 /*
464  *      Route lookup. rcu_read_lock() should be held.
465  */
466
/* Walk @rt and its successors for the first live (!RTNH_F_DEAD) entry
 * matching the requested output interface, or owning @saddr when no
 * oif was given.  Returns the netns null entry when a strict interface
 * match (RT6_LOOKUP_F_IFACE) cannot be satisfied.
 */
static inline struct fib6_info *rt6_device_match(struct net *net,
						 struct fib6_info *rt,
						    const struct in6_addr *saddr,
						    int oif,
						    int flags)
{
	struct fib6_info *sprt;

	/* Fast path: nothing to match against and @rt itself is alive. */
	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
		const struct net_device *dev = sprt->fib6_nh.nh_dev;

		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE)
		return net->ipv6.fib6_null_entry;

	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
}
500
501 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred router-probe request: carries the neighbour-solicitation
 * target and the (held) device until rt6_probe_deferred() runs.
 */
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};
507
508 static void rt6_probe_deferred(struct work_struct *w)
509 {
510         struct in6_addr mcaddr;
511         struct __rt6_probe_work *work =
512                 container_of(w, struct __rt6_probe_work, work);
513
514         addrconf_addr_solict_mult(&work->target, &mcaddr);
515         ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
516         dev_put(work->dev);
517         kfree(work);
518 }
519
/* Router Reachability Probing: when the gateway's reachability is
 * unconfirmed, schedule a neighbour solicitation towards it (sent from
 * workqueue context by rt6_probe_deferred()), rate-limited per
 * rtr_probe_interval.
 */
static void rt6_probe(struct fib6_info *rt)
{
	struct __rt6_probe_work *work = NULL;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;
	struct inet6_dev *idev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !rt->fib6_nh.fib_nh_has_gw)
		return;

	nh_gw = &rt->fib6_nh.nh_gw;
	dev = rt->fib6_nh.nh_dev;
	rcu_read_lock_bh();
	idev = __in6_dev_get(dev);
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		/* Re-check state under the neighbour lock and apply the
		 * per-neighbour rate limit before queueing a probe.
		 */
		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else if (time_after(jiffies, rt->last_probe +
				       idev->cnf.rtr_probe_interval)) {
		/* No neighbour entry yet: rate-limit via the route itself. */
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		rt->last_probe = jiffies;
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
574 #else
/* Router reachability probing requires CONFIG_IPV6_ROUTER_PREF. */
static inline void rt6_probe(struct fib6_info *rt)
{
}
578 #endif
579
580 /*
581  * Default Router Selection (RFC 2461 6.3.6)
582  */
583 static inline int rt6_check_dev(struct fib6_info *rt, int oif)
584 {
585         const struct net_device *dev = rt->fib6_nh.nh_dev;
586
587         if (!oif || dev->ifindex == oif)
588                 return 2;
589         return 0;
590 }
591
/* Neighbour-reachability component of route scoring.  Routes without a
 * gateway (or flagged RTF_NONEXTHOP) always succeed; otherwise the
 * verdict depends on the gateway's NUD state -- see enum rt6_nud_state.
 */
static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	if (rt->fib6_flags & RTF_NONEXTHOP ||
	    !rt->fib6_nh.fib_nh_has_gw)
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
					  &rt->fib6_nh.nh_gw);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		/* No neighbour entry: succeed optimistically when router
		 * preference (and thus probing) is built in, otherwise ask
		 * the caller to round-robin to another router.
		 */
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
623
/* Score @rt for router selection: interface match in the low bits,
 * decoded router preference above them (CONFIG_IPV6_ROUTER_PREF), or a
 * negative rt6_nud_state when the route must be rejected.
 */
static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}
641
/* Compare @rt against the current best @match and return whichever
 * scores higher; dead, link-down and expired routes are skipped.
 * *@mpri carries the best score so far, *@do_rr is set when the chosen
 * route requested round-robin (RT6_NUD_FAIL_DO_RR).
 */
static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
				   int *mpri, struct fib6_info *match,
				   bool *do_rr)
{
	int m;
	bool match_do_rr = false;

	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		goto out;

	/* Skip link-down nexthops unless the device or the lookup allows
	 * ignoring link state.
	 */
	if (ip6_ignore_linkdown(rt->fib6_nh.nh_dev) &&
	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (fib6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}
680
/* Scan the routes sharing @metric for the best match, starting at the
 * round-robin head @rr_head and wrapping around from @leaf.  Routes
 * with a different metric (remembered in @cont) are only tried when no
 * same-metric route matched at all.
 */
static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				     struct fib6_info *leaf,
				     struct fib6_info *rr_head,
				     u32 metric, int oif, int strict,
				     bool *do_rr)
{
	struct fib6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	/* First pass: from the round-robin head to the end of the run. */
	for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	/* Second pass: wrap around from the leaf back up to the head. */
	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	/* Last resort: the remaining routes with a different metric. */
	for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
719
/* Select the best route in fib6_node @fn for @oif under @strict rules,
 * advancing the node's round-robin pointer when scoring requested it.
 * Returns the netns null entry when nothing usable is found.  Runs
 * under rcu_read_lock(); takes tb6_lock only to move rr_ptr.
 */
static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
				   int oif, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not points to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.fib6_null_entry;
}
769
770 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
771 {
772         return (rt->fib6_flags & RTF_NONEXTHOP) || rt->fib6_nh.fib_nh_has_gw;
773 }
774
775 #ifdef CONFIG_IPV6_ROUTE_INFO
776 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
777                   const struct in6_addr *gwaddr)
778 {
779         struct net *net = dev_net(dev);
780         struct route_info *rinfo = (struct route_info *) opt;
781         struct in6_addr prefix_buf, *prefix;
782         unsigned int pref;
783         unsigned long lifetime;
784         struct fib6_info *rt;
785
786         if (len < sizeof(struct route_info)) {
787                 return -EINVAL;
788         }
789
790         /* Sanity check for prefix_len and length */
791         if (rinfo->length > 3) {
792                 return -EINVAL;
793         } else if (rinfo->prefix_len > 128) {
794                 return -EINVAL;
795         } else if (rinfo->prefix_len > 64) {
796                 if (rinfo->length < 2) {
797                         return -EINVAL;
798                 }
799         } else if (rinfo->prefix_len > 0) {
800                 if (rinfo->length < 1) {
801                         return -EINVAL;
802                 }
803         }
804
805         pref = rinfo->route_pref;
806         if (pref == ICMPV6_ROUTER_PREF_INVALID)
807                 return -EINVAL;
808
809         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
810
811         if (rinfo->length == 3)
812                 prefix = (struct in6_addr *)rinfo->prefix;
813         else {
814                 /* this function is safe */
815                 ipv6_addr_prefix(&prefix_buf,
816                                  (struct in6_addr *)rinfo->prefix,
817                                  rinfo->prefix_len);
818                 prefix = &prefix_buf;
819         }
820
821         if (rinfo->prefix_len == 0)
822                 rt = rt6_get_dflt_router(net, gwaddr, dev);
823         else
824                 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
825                                         gwaddr, dev);
826
827         if (rt && !lifetime) {
828                 ip6_del_rt(net, rt);
829                 rt = NULL;
830         }
831
832         if (!rt && lifetime)
833                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
834                                         dev, pref);
835         else if (rt)
836                 rt->fib6_flags = RTF_ROUTEINFO |
837                                  (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
838
839         if (rt) {
840                 if (!addrconf_finite_timeout(lifetime))
841                         fib6_clean_expires(rt);
842                 else
843                         fib6_set_expires(rt, jiffies + HZ * lifetime);
844
845                 fib6_info_release(rt);
846         }
847         return 0;
848 }
849 #endif
850
851 /*
852  *      Misc support functions
853  */
854
/* Resolve the net_device a dst copy of @rt should reference; for
 * local/anycast routes this may be an L3 master device or the netns
 * loopback.  Called with rcu_lock held.
 */
static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
{
	struct net_device *dev = rt->fib6_nh.nh_dev;

	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}
877
/* Per-route-type dst error codes, indexed by fib6_type via
 * ip6_rt_type_to_error().  0 means the type is deliverable/forwardable;
 * a negative errno marks a reject-style route and becomes dst.error.
 */
static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};
892
/* Map a fib6_type to the dst.error value it implies: 0 for normal
 * routes, a negative errno for reject routes (see fib6_prop above).
 */
static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}
897
898 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
899 {
900         unsigned short flags = 0;
901
902         if (rt->dst_nocount)
903                 flags |= DST_NOCOUNT;
904         if (rt->dst_nopolicy)
905                 flags |= DST_NOPOLICY;
906         if (rt->dst_host)
907                 flags |= DST_HOST;
908
909         return flags;
910 }
911
/* Initialize the dst of a reject-type route: set dst.error from the
 * fib6_type and install the matching input/output handlers that drop
 * (and, where applicable, signal an ICMP error for) packets.
 */
static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);

	switch (ort->fib6_type) {
	case RTN_BLACKHOLE:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		/* RTN_THROW and RTN_UNREACHABLE share the discard handlers;
		 * they differ only in the dst.error set above.
		 */
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}
933
/* Set up dst error, input/output handlers, and lwtunnel state of @rt
 * from the fib6 entry @ort it is being instantiated from.
 */
static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
{
	/* Reject routes only get the error + discard handlers */
	if (ort->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, ort);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	/* Input handler: local delivery, multicast input, or forwarding */
	if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	/* Inherit lightweight-tunnel state; lwtunnel_set_redirect() may
	 * redirect the input/output handlers chosen above.
	 */
	if (ort->fib6_nh.nh_lwtstate) {
		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}
959
/* Link @rt back to the fib6_info it was created from and inherit its
 * metrics.  Caller must already hold a reference to @from; that
 * reference is from now on owned through rt->from.
 */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	/* publish the backpointer for RCU readers */
	rcu_assign_pointer(rt->from, from);
	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
}
967
/* Initialize a freshly allocated rt6_info @rt from fib6 entry @ort:
 * dst handlers, destination/source prefixes, idev, flags, and gateway.
 * Caller must already hold a reference to @ort (consumed via
 * rt6_set_from()).
 */
static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
{
	struct net_device *dev = fib6_info_nh_dev(ort);

	ip6_rt_init_dst(rt, ort);

	rt->rt6i_dst = ort->fib6_dst;
	/* takes a reference on the inet6_dev; dropped when rt is destroyed */
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_flags = ort->fib6_flags;
	if (ort->fib6_nh.fib_nh_has_gw) {
		rt->rt6i_gateway = ort->fib6_nh.nh_gw;
		rt->rt6i_flags |= RTF_GATEWAY;
	}
	rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->fib6_src;
#endif
}
987
/* Walk back up the fib tree from @fn until a node carrying route info
 * (RTN_RTINFO) is found, descending into a parent's source-address
 * subtree when one exists and we did not just come from it.  Returns
 * NULL once the tree root is reached without a match.
 * Runs under rcu_read_lock (parent pointers are rcu_dereference'd).
 */
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}
1005
/* Try to take a reference on *prt.  If the dst is already on its way
 * to destruction (dst_hold_safe() fails), substitute the netns
 * ip6_null_entry (held) when @net is given, or NULL otherwise.
 * Returns true iff the original route was successfully held.
 */
static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (net) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}
1021
/* Create a (held) rt6_info from fib6 entry @rt, falling back to the
 * netns ip6_null_entry if the fib6_info is going away or allocation
 * fails.  Called with rcu_read_lock held.  The returned dst always
 * carries a reference for the caller.
 */
static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rt6_info *nrt;

	/* rt may be in the process of being freed; only proceed if we
	 * can still get a reference on it.
	 */
	if (!fib6_info_hold_safe(rt))
		goto fallback;

	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (!nrt) {
		fib6_info_release(rt);
		goto fallback;
	}

	/* the fib6_info reference taken above is handed to nrt->from */
	ip6_rt_copy_init(nrt, rt);
	return nrt;

fallback:
	nrt = dev_net(dev)->ipv6.ip6_null_entry;
	dst_hold(&nrt->dst);
	return nrt;
}
1046
/* Core policy-routing lookup for one fib6 table: find the best
 * fib6_info for the flow, honoring multipath selection and exception
 * (RTF_CACHE) entries, and return a held rt6_info for it (the netns
 * ip6_null_entry when nothing matches).
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_info *f6i;
	struct fib6_node *fn;
	struct rt6_info *rt;

	/* caller asked to ignore the oif when matching nexthops */
	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	f6i = rcu_dereference(fn->leaf);
	if (!f6i) {
		f6i = net->ipv6.fib6_null_entry;
	} else {
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
				      fl6->flowi6_oif, flags);
		/* pick one sibling by flow hash unless oif pinned it */
		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
			f6i = fib6_multipath_select(net, f6i, fl6,
						    fl6->flowi6_oif, skb,
						    flags);
	}
	/* no usable route at this node: backtrack toward the root and
	 * retry until a node with route info matches or we hit the root
	 */
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);
	} else if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = ip6_create_rt_rcu(f6i);
	}

	rcu_read_unlock();

	return rt;
}
1098
/* Public lookup entry point: run the flow through the fib rules
 * framework with ip6_pol_route_lookup() as the per-table resolver.
 * Returns a held dst_entry (never NULL; may be the null entry).
 */
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
1105
1106 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1107                             const struct in6_addr *saddr, int oif,
1108                             const struct sk_buff *skb, int strict)
1109 {
1110         struct flowi6 fl6 = {
1111                 .flowi6_oif = oif,
1112                 .daddr = *daddr,
1113         };
1114         struct dst_entry *dst;
1115         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1116
1117         if (saddr) {
1118                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1119                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1120         }
1121
1122         dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1123         if (dst->error == 0)
1124                 return (struct rt6_info *) dst;
1125
1126         dst_release(dst);
1127
1128         return NULL;
1129 }
1130 EXPORT_SYMBOL(rt6_lookup);
1131
/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes new route entry, the addition fails by any reason the
 * route is released.
 * Caller must hold dst before calling it.
 */

/* Insert @rt into its fib6 table under tb6_lock.  Returns 0 on
 * success or a negative errno from fib6_add() (which releases the
 * route on failure, per the contract above).
 */
static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}
1151
/* Insert @rt into its table with default netlink info (no notifier
 * portid/seq) and no extack.  See __ip6_ins_rt() for semantics.
 */
int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = {	.nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}
1158
/* Allocate an RTF_CACHE clone of @ort pinned to host route
 * (@daddr, and @saddr under subtrees).  Returns NULL if @ort is being
 * freed or allocation fails.  Runs under rcu_read_lock (uses
 * ip6_rt_get_dev_rcu()).
 */
static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (!fib6_info_hold_safe(ort))
		return NULL;

	dev = ip6_rt_get_dev_rcu(ort);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt) {
		fib6_info_release(ort);
		return NULL;
	}

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	/* narrow the clone to a /128 host route for @daddr */
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		/* daddr matching a non-host prefix exactly means the
		 * clone targets the anycast (subnet-router) address
		 */
		if (ort->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
1200
/* Allocate an RTF_PCPU rt6_info clone of @rt for per-cpu caching.
 * Returns NULL if @rt is being freed or allocation fails.  The
 * fib6_info reference taken here ends up owned by pcpu_rt->from.
 */
static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	if (!fib6_info_hold_safe(rt))
		return NULL;

	/* rcu scope covers ip6_rt_get_dev_rcu()'s device resolution */
	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt) {
		fib6_info_release(rt);
		return NULL;
	}
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}
1222
/* Fetch this cpu's cached clone of @rt, taking a reference on it.
 * It should be called with rcu_read_lock() acquired.
 * May return NULL: either no pcpu clone exists yet, or the hold
 * failed (ip6_hold_safe() with net == NULL nulls the pointer).
 */
static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt);

	return pcpu_rt;
}
1236
/* Allocate and install this cpu's cached clone of @rt, returning it
 * with an extra reference for the caller; falls back to the held
 * ip6_null_entry on allocation failure.
 */
static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	/* NOTE(review): the slot is expected to be empty here — callers
	 * apparently only reach this after rt6_get_pcpu_route() returned
	 * no entry on this cpu; a lost race would trip this BUG_ON.
	 */
	BUG_ON(prev);

	return pcpu_rt;
}
1255
/* Exception (RTF_CACHE) hash table implementation.
 *
 * rt6_exception_lock serializes all writers of every fib6_info's
 * rt6i_exception_bucket; lookups on the read side use RCU.
 */
static DEFINE_SPINLOCK(rt6_exception_lock);
1259
/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct fib6_info *from;
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	net->ipv6.rt6_stats->fib_rt_cache--;

	/* purge completely the exception to allow releasing the held resources:
	 * some [sk] cache may keep the dst around for unlimited time
	 */
	from = rcu_dereference_protected(rt6_ex->rt6i->from,
					 lockdep_is_held(&rt6_exception_lock));
	/* sever the backpointer and drop the fib6_info reference it held */
	rcu_assign_pointer(rt6_ex->rt6i->from, NULL);
	fib6_info_release(from);
	dst_dev_put(&rt6_ex->rt6i->dst);

	hlist_del_rcu(&rt6_ex->hlist);
	/* drop the table's reference; the entry itself is freed after a
	 * grace period so racing RCU lookups stay safe
	 */
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
}
1290
/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 *
 * "Oldest" is the entry with the earliest ->stamp (last-use time);
 * used to cap a bucket at FIB6_MAX_DEPTH entries.
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}
1307
/* Hash a (dst, src) address pair into an exception bucket index.
 * The jhash seed is initialized lazily, once, from the random pool;
 * src is only mixed in when subtrees (source routing) are enabled.
 */
static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
1323
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 *
 * On entry *bucket is the base of the bucket array; on return it has
 * been advanced to the hashed slot (even when no entry matched).
 * Returns the matching exception or NULL.
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		/* saddr only participates in the match under subtrees */
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1356
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 *
 * RCU-reader twin of __rt6_find_exception_spinlock(): same hashing
 * and matching, but traverses the chain with hlist_for_each_entry_rcu.
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		/* saddr only participates in the match under subtrees */
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1391
/* Effective MTU of fib6 entry @rt: the route's own PMTU if set,
 * otherwise the nexthop device's IPv6 MTU; clamped to IP6_MAX_MTU and
 * reduced by any lightweight-tunnel encapsulation headroom.
 */
static unsigned int fib6_mtu(const struct fib6_info *rt)
{
	unsigned int mtu;

	if (rt->fib6_pmtu) {
		mtu = rt->fib6_pmtu;
	} else {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		/* NOTE(review): __in6_dev_get() may return NULL; this
		 * assumes the nexthop device always has an inet6_dev —
		 * confirm callers guarantee that.
		 */
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
}
1412
/* Insert RTF_CACHE route @nrt into the exception table of its parent
 * fib6 entry @ort, allocating the bucket array on first use,
 * replacing any existing entry for the same (daddr[,saddr]) key, and
 * evicting the oldest entry if the bucket exceeds FIB6_MAX_DEPTH.
 * On success the table's fn_sernum is bumped so cached dsts are
 * revalidated.  Returns 0 or a negative errno.
 */
static int rt6_insert_exception(struct rt6_info *nrt,
				struct fib6_info *ort)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	/* ort is being deleted; rt6_flush_exceptions() already ran */
	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
		err = -EINVAL;
		goto out;
	}

	/* replace any stale entry for the same key */
	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->fib6_table->tb6_lock);
		fib6_update_sernum(net, ort);
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}
1492
/* Remove every exception route cached under fib6 entry @rt and mark
 * the entry so rt6_insert_exception() cannot repopulate it; called
 * when @rt is being deleted.  The bucket array itself is freed
 * elsewhere, with @rt.
 */
void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() to recreate the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		/* each bucket's depth must reach zero once emptied */
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}
1519
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 *
 * Returns the cached RTF_CACHE clone for (daddr[,saddr]) or NULL;
 * expired entries are treated as absent.  No reference is taken —
 * the caller must hold one itself (see ip6_hold_safe()).
 */
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->fib6_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}
1551
/* Remove the passed in cached rt from the hash table that contains it */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	/* NOTE(review): rt->from is read with rcu_dereference() but no
	 * rcu_read_lock is taken here — presumably callers run in an RCU
	 * read-side section; confirm against call sites.
	 */
	from = rcu_dereference(rt->from);
	/* only RTF_CACHE clones still linked to a fib6_info live in a table */
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}
1595
/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 *
 * The stamp records last use and drives LRU eviction in
 * rt6_exception_remove_oldest() and aging in rt6_age_exceptions().
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	/* only a live RTF_CACHE clone has an exception entry to refresh */
	if (!from || !(rt->rt6i_flags & RTF_CACHE))
		goto unlock;

	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

unlock:
	rcu_read_unlock();
}
1632
1633 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1634                                          struct rt6_info *rt, int mtu)
1635 {
1636         /* If the new MTU is lower than the route PMTU, this new MTU will be the
1637          * lowest MTU in the path: always allow updating the route PMTU to
1638          * reflect PMTU decreases.
1639          *
1640          * If the new MTU is higher, and the route PMTU is equal to the local
1641          * MTU, this means the old MTU is the lowest in the path, so allow
1642          * updating it: if other nodes now have lower MTUs, PMTU discovery will
1643          * handle this.
1644          */
1645
1646         if (dst_mtu(&rt->dst) >= mtu)
1647                 return true;
1648
1649         if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1650                 return true;
1651
1652         return false;
1653 }
1654
/* Propagate a device MTU change to every exception route cached under
 * fib6 entry @rt, where rt6_mtu_change_route_allowed() permits it.
 * Caller must hold rt6_exception_lock (see rcu_dereference_protected).
 */
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}
1683
/* Matches cached (RTF_CACHE) routes that also go via a gateway */
#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

/* Drop every cached gateway route under fib6 entry @rt whose gateway
 * equals @gateway; used when that router should no longer be used
 * (e.g. after a redirect-to-host event).
 */
static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* cheap lockless check before taking the writer lock */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}
1720
/* Examine one exception entry during garbage collection and remove it
 * if it has aged out, expired, or points at a gateway that is no
 * longer a router; otherwise count it in gc_args->more so GC keeps
 * running.  Caller holds rt6_exception_lock (rt6_remove_exception()
 * requires it).
 */
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others have still references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		/* drop gateway clones whose neighbour has lost (or never
		 * had) the router flag; also covers a missing neighbour
		 */
		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}
1764
/* Walk every cached exception route hanging off @rt and let
 * rt6_age_examine_exception() prune the stale ones.  Cheap early exit
 * (no lock taken) when no exception bucket was ever allocated.
 */
void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	/* rcu_read_lock_bh: the examine helper does a lockless neighbour
	 * lookup (__ipv6_neigh_lookup_noref).
	 */
	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			/* _safe variant: entries may be unlinked as we go */
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}
1795
1796 /* must be called with rcu lock held */
/* Look up the best fib entry in @table for the flow in @fl6.
 * Must be called with rcu lock held.
 *
 * While rt6_select() only yields the null entry, backtrack toward
 * less-specific nodes; if still unresolved and RT6_LOOKUP_F_REACHABLE
 * was set, retry once from the original node accepting (possibly)
 * unreachable routes as well.
 */
struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
				    int oif, struct flowi6 *fl6, int strict)
{
	struct fib6_node *fn, *saved_fn;
	struct fib6_info *f6i;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	/* Caller asked to ignore the nexthop's oif constraint. */
	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	f6i = rt6_select(net, fn, oif, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	return f6i;
}
1827
/* Core policy-routing resolver: turn a fib6 lookup result into an
 * rt6_info dst.  The returned dst always carries a reference taken on
 * behalf of the caller; on lookup failure it is the held null entry.
 */
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_info *f6i;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	/* When not forwarding, prefer routers we believe are reachable. */
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	f6i = fib6_table_lookup(net, table, oif, fl6, strict);
	if (f6i->fib6_nsiblings)
		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);

	if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		return rt;
	}

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !f6i->fib6_nh.fib_nh_has_gw)) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			/* Allocation failed: fall back to the held null entry. */
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		return uncached_rt;
	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(f6i);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, f6i);

		local_bh_enable();
		rcu_read_unlock();

		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
1905
1906 static struct rt6_info *ip6_pol_route_input(struct net *net,
1907                                             struct fib6_table *table,
1908                                             struct flowi6 *fl6,
1909                                             const struct sk_buff *skb,
1910                                             int flags)
1911 {
1912         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1913 }
1914
1915 struct dst_entry *ip6_route_input_lookup(struct net *net,
1916                                          struct net_device *dev,
1917                                          struct flowi6 *fl6,
1918                                          const struct sk_buff *skb,
1919                                          int flags)
1920 {
1921         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1922                 flags |= RT6_LOOKUP_F_IFACE;
1923
1924         return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1925 }
1926 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1927
/* Fill @keys with the L3 fields used for multipath hashing of @skb.
 * For ICMPv6 error messages the fields are taken from the embedded
 * (offending) packet header instead, so errors hash onto the same path
 * as the flow they refer to; pre-dissected @flkeys are ignored in that
 * case because they describe the outer header.
 */
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	/* Only ICMPv6 errors embed the offending packet. */
	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	/* Hash on the embedded header, not the outer ICMPv6 one. */
	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowlabel(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}
1975
/* Compute the multipath hash for a flow; if skb is set it will be used
 * and fl6 can be NULL.  Policy 0 hashes L3 fields only (addresses +
 * flow label + protocol); policy 1 additionally folds in the transport
 * ports.  The top bit is reserved, hence the final >> 1.
 *
 * NOTE(review): hash_keys is only initialized for policies 0 and 1;
 * this assumes ip6_multipath_hash_policy() never returns another value
 * — confirm against the sysctl bounds.
 */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			/* Dissect only if the caller did not already. */
			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}
2032
/* Input-path entry point: build a flowi6 from the packet headers and
 * attach the resolved dst to @skb (replacing any existing one).
 */
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	/* Carry the RX tunnel id into the lookup key, if any. */
	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	/* Pre-compute the multipath hash for ICMPv6 so errors follow the
	 * same path as the flow they refer to.
	 */
	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}
2062
2063 static struct rt6_info *ip6_pol_route_output(struct net *net,
2064                                              struct fib6_table *table,
2065                                              struct flowi6 *fl6,
2066                                              const struct sk_buff *skb,
2067                                              int flags)
2068 {
2069         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2070 }
2071
2072 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
2073                                          struct flowi6 *fl6, int flags)
2074 {
2075         bool any_src;
2076
2077         if (ipv6_addr_type(&fl6->daddr) &
2078             (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
2079                 struct dst_entry *dst;
2080
2081                 dst = l3mdev_link_scope_lookup(net, fl6);
2082                 if (dst)
2083                         return dst;
2084         }
2085
2086         fl6->flowi6_iif = LOOPBACK_IFINDEX;
2087
2088         any_src = ipv6_addr_any(&fl6->saddr);
2089         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2090             (fl6->flowi6_oif && any_src))
2091                 flags |= RT6_LOOKUP_F_IFACE;
2092
2093         if (!any_src)
2094                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2095         else if (sk)
2096                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2097
2098         return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2099 }
2100 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2101
/* Clone @dst_orig into a blackhole dst (discards everything it is fed)
 * on the loopback device, copying metrics/keys/flags from the original.
 * Consumes the caller's reference on @dst_orig in all cases; returns
 * ERR_PTR(-ENOMEM) on allocation failure.
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new = &rt->dst;
		new->__use = 1;
		/* Both directions discard: that is what "blackhole" means. */
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		/* Not a per-cpu copy, whatever the original was. */
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}
2134
2135 /*
2136  *      Destination cache support functions
2137  */
2138
2139 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2140 {
2141         u32 rt_cookie = 0;
2142
2143         if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2144                 return false;
2145
2146         if (fib6_check_expired(f6i))
2147                 return false;
2148
2149         return true;
2150 }
2151
2152 static struct dst_entry *rt6_check(struct rt6_info *rt,
2153                                    struct fib6_info *from,
2154                                    u32 cookie)
2155 {
2156         u32 rt_cookie = 0;
2157
2158         if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2159             rt_cookie != cookie)
2160                 return NULL;
2161
2162         if (rt6_check_expired(rt))
2163                 return NULL;
2164
2165         return &rt->dst;
2166 }
2167
2168 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2169                                             struct fib6_info *from,
2170                                             u32 cookie)
2171 {
2172         if (!__rt6_check_expired(rt) &&
2173             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2174             fib6_check(from, cookie))
2175                 return &rt->dst;
2176         else
2177                 return NULL;
2178 }
2179
/* dst_ops->check: revalidate a cached IPv6 dst against @cookie.
 * Returns the dst if still valid, NULL to force the caller to re-route.
 */
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct dst_entry *dst_ret;
	struct fib6_info *from;
	struct rt6_info *rt;

	rt = container_of(dst, struct rt6_info, dst);

	rcu_read_lock();

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	from = rcu_dereference(rt->from);

	/* Per-cpu copies and uncached clones validate through their
	 * parent fib entry; everything else via rt6_check().
	 */
	if (from && (rt->rt6i_flags & RTF_PCPU ||
	    unlikely(!list_empty(&rt->rt6i_uncached))))
		dst_ret = rt6_dst_from_check(rt, from, cookie);
	else
		dst_ret = rt6_check(rt, from, cookie);

	rcu_read_unlock();

	return dst_ret;
}
2207
/* dst_ops->negative_advice: the caller has reason to distrust @dst.
 * Expired cache exceptions are unlinked; non-cache dsts are released so
 * the caller re-resolves.  Returns NULL when the dst should be dropped.
 */
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			rcu_read_lock();
			if (rt6_check_expired(rt)) {
				/* Unlink from the exception table; note the
				 * caller's reference is NOT dropped here.
				 */
				rt6_remove_exception_rt(rt);
				dst = NULL;
			}
			rcu_read_unlock();
		} else {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}
2227
/* dst_ops->link_failure: the neighbour for @skb's dst is unusable.
 * Send an address-unreachable error back and invalidate the route:
 * cache exceptions are removed outright, otherwise the fib node's
 * sernum is poisoned (for default routes) to force revalidation.
 */
static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		rcu_read_lock();
		if (rt->rt6i_flags & RTF_CACHE) {
			rt6_remove_exception_rt(rt);
		} else {
			struct fib6_info *from;
			struct fib6_node *fn;

			/* from may already be NULL if the fib entry went away */
			from = rcu_dereference(rt->from);
			if (from) {
				fn = rcu_dereference(from->fib6_node);
				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
					fn->fn_sernum = -1;
			}
		}
		rcu_read_unlock();
	}
}
2253
/* Arm (or re-arm) the expiry on @rt0, @timeout jiffies from now.
 * If the route had no expiry yet, first inherit the current one from
 * its fib entry — presumably so dst_set_expires() only ever moves the
 * deadline earlier; confirm against dst_set_expires() semantics.
 */
static void rt6_update_expires(struct rt6_info *rt0, int timeout)
{
	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
		struct fib6_info *from;

		rcu_read_lock();
		from = rcu_dereference(rt0->from);
		if (from)
			rt0->dst.expires = from->expires;
		rcu_read_unlock();
	}

	dst_set_expires(&rt0->dst, timeout);
	rt0->rt6i_flags |= RTF_EXPIRES;
}
2269
2270 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2271 {
2272         struct net *net = dev_net(rt->dst.dev);
2273
2274         dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2275         rt->rt6i_flags |= RTF_MODIFIED;
2276         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2277 }
2278
2279 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2280 {
2281         return !(rt->rt6i_flags & RTF_CACHE) &&
2282                 (rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from));
2283 }
2284
2285 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2286                                  const struct ipv6hdr *iph, u32 mtu)
2287 {
2288         const struct in6_addr *daddr, *saddr;
2289         struct rt6_info *rt6 = (struct rt6_info *)dst;
2290
2291         if (dst_metric_locked(dst, RTAX_MTU))
2292                 return;
2293
2294         if (iph) {
2295                 daddr = &iph->daddr;
2296                 saddr = &iph->saddr;
2297         } else if (sk) {
2298                 daddr = &sk->sk_v6_daddr;
2299                 saddr = &inet6_sk(sk)->saddr;
2300         } else {
2301                 daddr = NULL;
2302                 saddr = NULL;
2303         }
2304         dst_confirm_neigh(dst, daddr);
2305         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2306         if (mtu >= dst_mtu(dst))
2307                 return;
2308
2309         if (!rt6_cache_allowed_for_pmtu(rt6)) {
2310                 rt6_do_update_pmtu(rt6, mtu);
2311                 /* update rt6_ex->stamp for cache */
2312                 if (rt6->rt6i_flags & RTF_CACHE)
2313                         rt6_update_exception_stamp_rt(rt6);
2314         } else if (daddr) {
2315                 struct fib6_info *from;
2316                 struct rt6_info *nrt6;
2317
2318                 rcu_read_lock();
2319                 from = rcu_dereference(rt6->from);
2320                 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
2321                 if (nrt6) {
2322                         rt6_do_update_pmtu(nrt6, mtu);
2323                         if (rt6_insert_exception(nrt6, from))
2324                                 dst_release_immediate(&nrt6->dst);
2325                 }
2326                 rcu_read_unlock();
2327         }
2328 }
2329
2330 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2331                                struct sk_buff *skb, u32 mtu)
2332 {
2333         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2334 }
2335
/* Update the cached path MTU for the flow described by @skb's IPv6
 * header.  @mtu is in network byte order (converted with ntohl below).
 * A zero @mark falls back to the reply mark derived from the skb.
 */
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
		     int oif, u32 mark, kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_uid = uid,
	};

	/* Route the flow ourselves, update that dst, then drop our ref. */
	dst = ip6_route_output(net, NULL, &fl6);
	if (!dst->error)
		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2356
/* Per-socket PMTU update: apply the new MTU for @sk's flow and, if the
 * socket's cached dst no longer validates afterwards, refresh it.
 */
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	int oif = sk->sk_bound_dev_if;
	struct dst_entry *dst;

	/* Unbound socket: scope by the receiving device's L3 master. */
	if (!oif && skb->dev)
		oif = l3mdev_master_ifindex(skb->dev);

	ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);

	/* Nothing more to do while the cached dst still validates. */
	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2378
/* Cache @dst on @sk.  The destination (and, with subtrees, the source)
 * address is pinned only when it matches the socket's own address, so a
 * flow to a different address does not poison the socket cache.
 */
void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
			   const struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_SUBTREES
	struct ipv6_pinfo *np = inet6_sk(sk);
#endif

	ip6_dst_store(sk, dst,
		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
		      &sk->sk_v6_daddr : NULL,
#ifdef CONFIG_IPV6_SUBTREES
		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
		      &np->saddr :
#endif
		      NULL);
}
2395
/* Handle redirects: flow key extended with the redirecting gateway.
 * fl6 must stay the first member — __ip6_route_redirect() casts the
 * flowi6 pointer back to this struct.
 */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;	/* router that sent the redirect */
};
2401
/* Lookup backend for redirect processing: find the route whose nexthop
 * matches the router that sent the redirect (carried in the ip6rd_flowi
 * wrapped around @fl6).  Returns a held rt6_info.
 */
static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *ret = NULL, *rt_cache;
	struct fib6_info *rt;
	struct fib6_node *fn;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;
		if (fib6_check_expired(rt))
			continue;
		if (rt->fib6_flags & RTF_REJECT)
			break;
		if (!rt->fib6_nh.fib_nh_has_gw)
			continue;
		if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
			continue;
		/* rt_cache's gateway might be different from its 'parent'
		 * in the case of an ip redirect.
		 * So we keep searching in the exception table if the gateway
		 * is different.
		 */
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
			rt_cache = rt6_find_cached_rt(rt,
						      &fl6->daddr,
						      &fl6->saddr);
			if (rt_cache &&
			    ipv6_addr_equal(&rdfl->gateway,
					    &rt_cache->rt6i_gateway)) {
				ret = rt_cache;
				break;
			}
			continue;
		}
		break;
	}

	if (!rt)
		rt = net->ipv6.fib6_null_entry;
	else if (rt->fib6_flags & RTF_REJECT) {
		ret = net->ipv6.ip6_null_entry;
		goto out;
	}

	/* No match at this node: back off to a less-specific one. */
	if (rt == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

out:
	if (ret)
		ip6_hold_safe(net, &ret);
	else
		ret = ip6_create_rt_rcu(rt);

	rcu_read_unlock();

	trace_fib6_table_lookup(net, rt, table, fl6);
	return ret;
};
2481
2482 static struct dst_entry *ip6_route_redirect(struct net *net,
2483                                             const struct flowi6 *fl6,
2484                                             const struct sk_buff *skb,
2485                                             const struct in6_addr *gateway)
2486 {
2487         int flags = RT6_LOOKUP_F_HAS_SADDR;
2488         struct ip6rd_flowi rdfl;
2489
2490         rdfl.fl6 = *fl6;
2491         rdfl.gateway = *gateway;
2492
2493         return fib6_rule_lookup(net, &rdfl.fl6, skb,
2494                                 flags, __ip6_route_redirect);
2495 }
2496
/* Process an ICMPv6 redirect carried in @skb: look up the affected
 * route (validating the redirecting router, i.e. the outer saddr) and
 * apply the redirect to it.
 */
void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
		  kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_iif = LOOPBACK_IFINDEX,
		.flowi6_oif = oif,
		.flowi6_mark = mark,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_uid = uid,
	};

	dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_redirect);
2517
/* Redirect variant used when the redirected packet's own header is not
 * available: the flow is keyed on the rd_msg destination, with our
 * address taken from the outer header's daddr.
 */
void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_iif = LOOPBACK_IFINDEX,
		.flowi6_oif = oif,
		.daddr = msg->dest,
		.saddr = iph->daddr,
		.flowi6_uid = sock_net_uid(net, NULL),
	};

	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
2535
2536 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2537 {
2538         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2539                      sk->sk_uid);
2540 }
2541 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2542
2543 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2544 {
2545         struct net_device *dev = dst->dev;
2546         unsigned int mtu = dst_mtu(dst);
2547         struct net *net = dev_net(dev);
2548
2549         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2550
2551         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2552                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2553
2554         /*
2555          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2556          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2557          * IPV6_MAXPLEN is also valid and means: "any MSS,
2558          * rely only on pmtu discovery"
2559          */
2560         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2561                 mtu = IPV6_MAXPLEN;
2562         return mtu;
2563 }
2564
2565 static unsigned int ip6_mtu(const struct dst_entry *dst)
2566 {
2567         struct inet6_dev *idev;
2568         unsigned int mtu;
2569
2570         mtu = dst_metric_raw(dst, RTAX_MTU);
2571         if (mtu)
2572                 goto out;
2573
2574         mtu = IPV6_MIN_MTU;
2575
2576         rcu_read_lock();
2577         idev = __in6_dev_get(dst->dev);
2578         if (idev)
2579                 mtu = idev->cnf.mtu6;
2580         rcu_read_unlock();
2581
2582 out:
2583         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2584
2585         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2586 }
2587
/* MTU selection:
 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 *
 * based on ip6_dst_mtu_forward and exception logic of
 * rt6_find_cached_rt; called with rcu_read_lock
 */
u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
		      struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct in6_addr *src_key;
	struct inet6_dev *idev;
	u32 mtu = 0;

	/* 1. a locked MTU metric wins outright; skip exception/device probes */
	if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
		mtu = f6i->fib6_pmtu;
		if (mtu)
			goto out;
	}

	/* 2. look for a cached PMTU exception keyed by daddr (and saddr
	 * only for source-specific/subtree routes)
	 */
	src_key = NULL;
#ifdef CONFIG_IPV6_SUBTREES
	if (f6i->fib6_src.plen)
		src_key = saddr;
#endif

	bucket = rcu_dereference(f6i->rt6i_exception_bucket);
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);

	/* 3. no exception (or it had no MTU metric): fall back to the
	 * egress device MTU, floored at IPV6_MIN_MTU
	 */
	if (likely(!mtu)) {
		struct net_device *dev = fib6_info_nh_dev(f6i);

		mtu = IPV6_MIN_MTU;
		idev = __in6_dev_get(dev);
		if (idev && idev->cnf.mtu6 > mtu)
			mtu = idev->cnf.mtu6;
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
out:
	/* reserve headroom for any lwtunnel encap on the nexthop */
	return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
}
2635
/* Allocate a transient host route used for sending ICMPv6 packets.
 * The dst is not inserted into the FIB; it is placed on the uncached
 * list so device teardown (rt6_disable_ip) can release it. Returns a
 * dst (possibly transformed by xfrm_lookup) or an ERR_PTR.
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		/* drop the idev reference taken above before bailing */
		in6_dev_put(idev);
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	/* /128 host route to the flow's destination; idev reference is
	 * handed over to the rt here (released when the dst dies)
	 */
	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output  = ip6_output;
	rt->rt6i_gateway  = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev     = idev;
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Add this dst into uncached_list so that rt6_disable_ip() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);
	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}
2674
/* dst_ops->gc: garbage-collect IPv6 dst entries. Skips the run when
 * the last GC was recent and the entry count is within bounds;
 * otherwise runs fib6_run_gc with an adaptively growing expire value
 * that decays by 1/2^elasticity each call. Returns nonzero when the
 * table is still over rt_max_size (signals allocation pressure).
 */
static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	/* each unproductive pass ages routes more aggressively */
	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
	/* exponential decay of the expire value between runs */
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
	return entries > rt_max_size;
}
2699
2700 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2701                                             struct fib6_config *cfg,
2702                                             const struct in6_addr *gw_addr,
2703                                             u32 tbid, int flags)
2704 {
2705         struct flowi6 fl6 = {
2706                 .flowi6_oif = cfg->fc_ifindex,
2707                 .daddr = *gw_addr,
2708                 .saddr = cfg->fc_prefsrc,
2709         };
2710         struct fib6_table *table;
2711         struct rt6_info *rt;
2712
2713         table = fib6_get_table(net, tbid);
2714         if (!table)
2715                 return NULL;
2716
2717         if (!ipv6_addr_any(&cfg->fc_prefsrc))
2718                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2719
2720         flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2721         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2722
2723         /* if table lookup failed, fall back to full lookup */
2724         if (rt == net->ipv6.ip6_null_entry) {
2725                 ip6_rt_put(rt);
2726                 rt = NULL;
2727         }
2728
2729         return rt;
2730 }
2731
/* Validate an RTNH_F_ONLINK nexthop: the gateway must not resolve (in
 * the device's table, or main) to a local/anycast/reject route or to a
 * different egress device. A match on the default route is ignored.
 * Returns 0 when acceptable, -EINVAL (with extack message) otherwise.
 */
static int ip6_route_check_nh_onlink(struct net *net,
				     struct fib6_config *cfg,
				     const struct net_device *dev,
				     struct netlink_ext_ack *extack)
{
	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
	struct fib6_info *from;
	struct rt6_info *grt;
	int err;

	err = 0;
	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
	if (grt) {
		/* grt->from is RCU-protected; read it under the lock */
		rcu_read_lock();
		from = rcu_dereference(grt->from);
		if (!grt->dst.error &&
		    /* ignore match if it is the default route */
		    from && !ipv6_addr_any(&from->fib6_dst.addr) &&
		    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop has invalid gateway or device mismatch");
			err = -EINVAL;
		}
		rcu_read_unlock();

		/* drop the lookup reference in all cases */
		ip6_rt_put(grt);
	}

	return err;
}
2764
/* Resolve/validate a non-onlink gateway nexthop. Tries the configured
 * table first (interface-restricted), then a full lookup. On success
 * with no device given, *_dev/*idev are filled in with held references
 * the caller must release. Returns 0 only when the gateway resolves to
 * a non-gateway (directly connected) route; -EHOSTUNREACH otherwise.
 */
static int ip6_route_check_nh(struct net *net,
			      struct fib6_config *cfg,
			      struct net_device **_dev,
			      struct inet6_dev **idev)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	struct net_device *dev = _dev ? *_dev : NULL;
	struct rt6_info *grt = NULL;
	int err = -EHOSTUNREACH;

	if (cfg->fc_table) {
		int flags = RT6_LOOKUP_F_IFACE;

		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
					  cfg->fc_table, flags);
		if (grt) {
			/* discard a table hit that goes via another gateway
			 * or a different device than the one requested
			 */
			if (grt->rt6i_flags & RTF_GATEWAY ||
			    (dev && dev != grt->dst.dev)) {
				ip6_rt_put(grt);
				grt = NULL;
			}
		}
	}

	if (!grt)
		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);

	if (!grt)
		goto out;

	if (dev) {
		if (dev != grt->dst.dev) {
			ip6_rt_put(grt);
			goto out;
		}
	} else {
		/* adopt the device/idev from the lookup result; take
		 * references for the caller before dropping grt below
		 */
		*_dev = dev = grt->dst.dev;
		*idev = grt->rt6i_idev;
		dev_hold(dev);
		in6_dev_hold(grt->rt6i_idev);
	}

	if (!(grt->rt6i_flags & RTF_GATEWAY))
		err = 0;

	ip6_rt_put(grt);

out:
	return err;
}
2815
/* Validate the gateway of a new route and resolve the egress device if
 * one was not supplied. On success *_dev/*idev identify the egress
 * device (possibly newly held by ip6_route_check_nh). Returns 0 or a
 * negative errno with an extack message set.
 */
static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
			   struct net_device **_dev, struct inet6_dev **idev,
			   struct netlink_ext_ack *extack)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	int gwa_type = ipv6_addr_type(gw_addr);
	/* for link-local gateways only a same-device address is "local" */
	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
	const struct net_device *dev = *_dev;
	bool need_addr_check = !dev;
	int err = -EINVAL;

	/* if gw_addr is local we will fail to detect this in case
	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
	 * will return already-added prefix route via interface that
	 * prefix route was assigned to, which might be non-loopback.
	 */
	if (dev &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
		/* IPv6 strictly inhibits using not link-local
		 * addresses as nexthop address.
		 * Otherwise, router will not able to send redirects.
		 * It is very good, but in some (rare!) circumstances
		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
		 * some exceptions. --ANK
		 * We allow IPv4-mapped nexthops to support RFC4798-type
		 * addressing
		 */
		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
			NL_SET_ERR_MSG(extack, "Invalid gateway address");
			goto out;
		}

		/* non-link-local gateway must itself be reachable */
		if (cfg->fc_flags & RTNH_F_ONLINK)
			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
		else
			err = ip6_route_check_nh(net, cfg, _dev, idev);

		if (err)
			goto out;
	}

	/* reload in case device was changed */
	dev = *_dev;

	err = -EINVAL;
	if (!dev) {
		NL_SET_ERR_MSG(extack, "Egress device not specified");
		goto out;
	} else if (dev->flags & IFF_LOOPBACK) {
		NL_SET_ERR_MSG(extack,
			       "Egress device can not be loopback device for this route");
		goto out;
	}

	/* if we did not check gw_addr above, do so now that the
	 * egress device has been resolved.
	 */
	if (need_addr_check &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	err = 0;
out:
	return err;
}
2888
2889 static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type)
2890 {
2891         if ((flags & RTF_REJECT) ||
2892             (dev && (dev->flags & IFF_LOOPBACK) &&
2893              !(addr_type & IPV6_ADDR_LOOPBACK) &&
2894              !(flags & RTF_LOCAL)))
2895                 return true;
2896
2897         return false;
2898 }
2899
/* Initialize a fib6_nh from a route configuration: resolve and validate
 * the nexthop device, build any lwtunnel state, validate the gateway,
 * and set the nexthop flags. On success the fib6_nh owns a device
 * reference (nh_dev); on failure all temporary references and lwtstate
 * are released. Returns 0 or a negative errno with extack set.
 */
int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
		 struct fib6_config *cfg, gfp_t gfp_flags,
		 struct netlink_ext_ack *extack)
{
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	int addr_type;
	int err;

	err = -ENODEV;
	if (cfg->fc_ifindex) {
		/* both dev and idev are held here; released in out/err paths */
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_flags & RTNH_F_ONLINK) {
		if (!dev) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop device required for onlink");
			goto out;
		}

		if (!(dev->flags & IFF_UP)) {
			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
			err = -ENETDOWN;
			goto out;
		}

		fib6_nh->nh_flags |= RTNH_F_ONLINK;
	}

	if (cfg->fc_encap) {
		struct lwtunnel_state *lwtstate;

		err = lwtunnel_build_state(cfg->fc_encap_type,
					   cfg->fc_encap, AF_INET6, cfg,
					   &lwtstate, extack);
		if (err)
			goto out;

		fib6_nh->nh_lwtstate = lwtstate_get(lwtstate);
	}

	fib6_nh->nh_weight = 1;

	/* We cannot add true routes via loopback here,
	 * they would result in kernel looping; promote them to reject routes
	 */
	addr_type = ipv6_addr_type(&cfg->fc_dst);
	if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				/* swap the configured device for loopback */
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		/* reject routes skip gateway/link-state validation */
		goto set_dev;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		/* may resolve and hold dev/idev if none was configured */
		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
		if (err)
			goto out;

		fib6_nh->nh_gw = cfg->fc_gateway;
		fib6_nh->fib_nh_has_gw = 1;
	}

	err = -ENODEV;
	if (!dev)
		goto out;

	if (idev->cnf.disable_ipv6) {
		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
		err = -EACCES;
		goto out;
	}

	if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) {
		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
		err = -ENETDOWN;
		goto out;
	}

	/* local/anycast routes stay usable regardless of carrier */
	if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
	    !netif_carrier_ok(dev))
		fib6_nh->nh_flags |= RTNH_F_LINKDOWN;

set_dev:
	/* ownership of the dev reference transfers to the fib6_nh */
	fib6_nh->nh_dev = dev;
	err = 0;
out:
	if (idev)
		in6_dev_put(idev);

	if (err) {
		/* undo lwtstate and device references on any failure */
		lwtstate_put(fib6_nh->nh_lwtstate);
		fib6_nh->nh_lwtstate = NULL;
		if (dev)
			dev_put(dev);
	}

	return err;
}
3016
3017 void fib6_nh_release(struct fib6_nh *fib6_nh)
3018 {
3019         lwtstate_put(fib6_nh->nh_lwtstate);
3020
3021         if (fib6_nh->nh_dev)
3022                 dev_put(fib6_nh->nh_dev);
3023 }
3024
/* Build a fib6_info from a netlink/ioctl route configuration without
 * inserting it into the FIB. Validates the config, selects/creates the
 * table, allocates the info and metrics, initializes the nexthop and
 * preferred source. Returns the new fib6_info or an ERR_PTR; on error
 * any partially-built info is released.
 */
static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
					      gfp_t gfp_flags,
					      struct netlink_ext_ack *extack)
{
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct fib6_info *rt = NULL;
	struct fib6_table *table;
	int err = -EINVAL;
	int addr_type;

	/* RTF_PCPU is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_PCPU) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
		goto out;
	}

	/* RTF_CACHE is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_CACHE) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
		goto out;
	}

	if (cfg->fc_type > RTN_MAX) {
		NL_SET_ERR_MSG(extack, "Invalid route type");
		goto out;
	}

	if (cfg->fc_dst_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		goto out;
	}
	if (cfg->fc_src_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid source address length");
		goto out;
	}
#ifndef CONFIG_IPV6_SUBTREES
	if (cfg->fc_src_len) {
		NL_SET_ERR_MSG(extack,
			       "Specifying source address requires IPV6_SUBTREES to be enabled");
		goto out;
	}
#endif

	/* without NLM_F_CREATE only look up an existing table (but warn
	 * and create anyway for backward compatibility)
	 */
	err = -ENOBUFS;
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
		if (!table) {
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}

	if (!table)
		goto out;

	err = -ENOMEM;
	rt = fib6_info_alloc(gfp_flags);
	if (!rt)
		goto out;

	rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
					       extack);
	if (IS_ERR(rt->fib6_metrics)) {
		err = PTR_ERR(rt->fib6_metrics);
		/* Do not leave garbage there. */
		rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
		goto out;
	}

	if (cfg->fc_flags & RTF_ADDRCONF)
		rt->dst_nocount = true;

	if (cfg->fc_flags & RTF_EXPIRES)
		fib6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
	else
		fib6_clean_expires(rt);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->fib6_protocol = cfg->fc_protocol;

	rt->fib6_table = table;
	rt->fib6_metric = cfg->fc_metric;
	rt->fib6_type = cfg->fc_type;
	/* RTF_GATEWAY is tracked on the nexthop, not the route flags */
	rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY;

	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->fib6_dst.plen = cfg->fc_dst_len;
	if (rt->fib6_dst.plen == 128)
		rt->dst_host = true;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->fib6_src.plen = cfg->fc_src_len;
#endif
	err = fib6_nh_init(net, &rt->fib6_nh, cfg, gfp_flags, extack);
	if (err)
		goto out;

	/* We cannot add true routes via loopback here,
	 * they would result in kernel looping; promote them to reject routes
	 */
	addr_type = ipv6_addr_type(&cfg->fc_dst);
	if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh.nh_dev, addr_type))
		rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;

	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		struct net_device *dev = fib6_info_nh_dev(rt);

		/* the preferred source must be an address on the nexthop dev */
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid source address");
			err = -EINVAL;
			goto out;
		}
		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
		rt->fib6_prefsrc.plen = 128;
	} else
		rt->fib6_prefsrc.plen = 0;

	return rt;
out:
	fib6_info_release(rt);
	return ERR_PTR(err);
}
3153
3154 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3155                   struct netlink_ext_ack *extack)
3156 {
3157         struct fib6_info *rt;
3158         int err;
3159
3160         rt = ip6_route_info_create(cfg, gfp_flags, extack);
3161         if (IS_ERR(rt))
3162                 return PTR_ERR(rt);
3163
3164         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3165         fib6_info_release(rt);
3166
3167         return err;
3168 }
3169
3170 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3171 {
3172         struct net *net = info->nl_net;
3173         struct fib6_table *table;
3174         int err;
3175
3176         if (rt == net->ipv6.fib6_null_entry) {
3177                 err = -ENOENT;
3178                 goto out;
3179         }
3180
3181         table = rt->fib6_table;
3182         spin_lock_bh(&table->tb6_lock);
3183         err = fib6_del(rt, info);
3184         spin_unlock_bh(&table->tb6_lock);
3185
3186 out:
3187         fib6_info_release(rt);
3188         return err;
3189 }
3190
3191 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3192 {
3193         struct nl_info info = { .nl_net = net };
3194
3195         return __ip6_del_rt(rt, &info);
3196 }
3197
/* Delete @rt and, when fc_delete_all_nh is set, all of its multipath
 * siblings under one table lock. Tries to emit a single RTM_DELROUTE
 * notification covering every hop (suppressing the per-hop ones);
 * falls back to per-hop notifications if the skb cannot be built.
 * Drops the caller's reference to @rt.
 */
static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	int err = -ENOENT;

	if (rt == net->ipv6.fib6_null_entry)
		goto out_put;
	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);

	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
		struct fib6_info *sibling, *next_sibling;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			if (rt6_fill_node(net, skb, rt, NULL,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				kfree_skb(skb);
				skb = NULL;
			} else
				info->skip_notify = 1;
		}

		/* _safe: fib6_del unlinks each sibling as we go */
		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings,
					 fib6_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
		}
	}

	err = fib6_del(rt, info);
out_unlock:
	spin_unlock_bh(&table->tb6_lock);
out_put:
	fib6_info_release(rt);

	/* send the combined notification outside the table lock */
	if (skb) {
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
	return err;
}
3249
3250 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3251 {
3252         int rc = -ESRCH;
3253
3254         if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3255                 goto out;
3256
3257         if (cfg->fc_flags & RTF_GATEWAY &&
3258             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3259                 goto out;
3260
3261         rc = rt6_remove_exception_rt(rt);
3262 out:
3263         return rc;
3264 }
3265
/* Delete a route described by @cfg. Locates the matching FIB node and
 * walks its routes under RCU: for RTF_CACHE requests only matching
 * cached exceptions are removed; otherwise the first route matching the
 * device/gateway/metric/protocol filters is deleted (one hop when a
 * gateway was given, all siblings otherwise). Returns 0 or -ESRCH.
 */
static int ip6_route_del(struct fib6_config *cfg,
			 struct netlink_ext_ack *extack)
{
	struct rt6_info *rt_cache;
	struct fib6_table *table;
	struct fib6_info *rt;
	struct fib6_node *fn;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table) {
		NL_SET_ERR_MSG(extack, "FIB table does not exist");
		return err;
	}

	rcu_read_lock();

	/* exact_match only when not targeting a cached exception */
	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len,
			 !(cfg->fc_flags & RTF_CACHE));

	if (fn) {
		for_each_fib6_node_rt_rcu(fn) {
			if (cfg->fc_flags & RTF_CACHE) {
				int rc;

				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
							      &cfg->fc_src);
				if (rt_cache) {
					rc = ip6_del_cached_rt(rt_cache, cfg);
					/* -ESRCH means filter mismatch;
					 * keep scanning other routes
					 */
					if (rc != -ESRCH) {
						rcu_read_unlock();
						return rc;
					}
				}
				continue;
			}
			if (cfg->fc_ifindex &&
			    (!rt->fib6_nh.nh_dev ||
			     rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
				continue;
			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
				continue;
			if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
				continue;
			/* take a reference before leaving the RCU section;
			 * skip entries already being freed
			 */
			if (!fib6_info_hold_safe(rt))
				continue;
			rcu_read_unlock();

			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				return __ip6_del_rt(rt, &cfg->fc_nlinfo);

			return __ip6_del_rt_siblings(rt, cfg);
		}
	}
	rcu_read_unlock();

	return err;
}
3330
3331 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3332 {
3333         struct netevent_redirect netevent;
3334         struct rt6_info *rt, *nrt = NULL;
3335         struct ndisc_options ndopts;
3336         struct inet6_dev *in6_dev;
3337         struct neighbour *neigh;
3338         struct fib6_info *from;
3339         struct rd_msg *msg;
3340         int optlen, on_link;
3341         u8 *lladdr;
3342
3343         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3344         optlen -= sizeof(*msg);
3345
3346         if (optlen < 0) {
3347                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3348                 return;
3349         }
3350
3351         msg = (struct rd_msg *)icmp6_hdr(skb);
3352
3353         if (ipv6_addr_is_multicast(&msg->dest)) {
3354                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3355                 return;
3356         }
3357
3358         on_link = 0;
3359         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3360                 on_link = 1;
3361         } else if (ipv6_addr_type(&msg->target) !=
3362                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3363                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3364                 return;
3365         }
3366
3367         in6_dev = __in6_dev_get(skb->dev);
3368         if (!in6_dev)
3369                 return;
3370         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3371                 return;
3372
3373         /* RFC2461 8.1:
3374          *      The IP source address of the Redirect MUST be the same as the current
3375          *      first-hop router for the specified ICMP Destination Address.
3376          */
3377
3378         if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3379                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3380                 return;
3381         }
3382
3383         lladdr = NULL;
3384         if (ndopts.nd_opts_tgt_lladdr) {
3385                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3386                                              skb->dev);
3387                 if (!lladdr) {
3388                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3389                         return;
3390                 }
3391         }
3392
3393         rt = (struct rt6_info *) dst;
3394         if (rt->rt6i_flags & RTF_REJECT) {
3395                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3396                 return;
3397         }
3398
3399         /* Redirect received -> path was valid.
3400          * Look, redirects are sent only in response to data packets,
3401          * so that this nexthop apparently is reachable. --ANK
3402          */
3403         dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3404
3405         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3406         if (!neigh)
3407                 return;
3408
3409         /*
3410          *      We have finally decided to accept it.
3411          */
3412
3413         ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3414                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
3415                      NEIGH_UPDATE_F_OVERRIDE|
3416                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3417                                      NEIGH_UPDATE_F_ISROUTER)),
3418                      NDISC_REDIRECT, &ndopts);
3419
3420         rcu_read_lock();
3421         from = rcu_dereference(rt->from);
3422         /* This fib6_info_hold() is safe here because we hold reference to rt
3423          * and rt already holds reference to fib6_info.
3424          */
3425         fib6_info_hold(from);
3426         rcu_read_unlock();
3427
3428         nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3429         if (!nrt)
3430                 goto out;
3431
3432         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3433         if (on_link)
3434                 nrt->rt6i_flags &= ~RTF_GATEWAY;
3435
3436         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3437
3438         /* No need to remove rt from the exception table if rt is
3439          * a cached route because rt6_insert_exception() will
3440          * takes care of it
3441          */
3442         if (rt6_insert_exception(nrt, from)) {
3443                 dst_release_immediate(&nrt->dst);
3444                 goto out;
3445         }
3446
3447         netevent.old = &rt->dst;
3448         netevent.new = &nrt->dst;
3449         netevent.daddr = &msg->dest;
3450         netevent.neigh = neigh;
3451         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3452
3453 out:
3454         fib6_info_release(from);
3455         neigh_release(neigh);
3456 }
3457
3458 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Look up an RA "Route Information" (RFC 4191) route for @prefix/@prefixlen
 * learned via gateway @gwaddr on @dev.  Returns the entry with a reference
 * held (see fib6_info_hold_safe below), or NULL if not found.
 */
static struct fib6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev)
{
	/* l3mdev (VRF) slaves use their master's table, others RT6_TABLE_INFO */
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	int ifindex = dev->ifindex;
	struct fib6_node *fn;
	struct fib6_info *rt = NULL;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
	if (!fn)
		goto out;

	/* the macro iterates 'rt' over this node's routes; 'rt' stays NULL
	 * (or becomes NULL) when no route survives the filters below
	 */
	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.nh_dev->ifindex != ifindex)
			continue;
		if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
		    !rt->fib6_nh.fib_nh_has_gw)
			continue;
		if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
			continue;
		/* skip entries that are already being torn down */
		if (!fib6_info_hold_safe(rt))
			continue;
		break;
	}
out:
	rcu_read_unlock();
	return rt;
}
3495
3496 static struct fib6_info *rt6_add_route_info(struct net *net,
3497                                            const struct in6_addr *prefix, int prefixlen,
3498                                            const struct in6_addr *gwaddr,
3499                                            struct net_device *dev,
3500                                            unsigned int pref)
3501 {
3502         struct fib6_config cfg = {
3503                 .fc_metric      = IP6_RT_PRIO_USER,
3504                 .fc_ifindex     = dev->ifindex,
3505                 .fc_dst_len     = prefixlen,
3506                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3507                                   RTF_UP | RTF_PREF(pref),
3508                 .fc_protocol = RTPROT_RA,
3509                 .fc_type = RTN_UNICAST,
3510                 .fc_nlinfo.portid = 0,
3511                 .fc_nlinfo.nlh = NULL,
3512                 .fc_nlinfo.nl_net = net,
3513         };
3514
3515         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3516         cfg.fc_dst = *prefix;
3517         cfg.fc_gateway = *gwaddr;
3518
3519         /* We should treat it as a default route if prefix length is 0. */
3520         if (!prefixlen)
3521                 cfg.fc_flags |= RTF_DEFAULT;
3522
3523         ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3524
3525         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3526 }
3527 #endif
3528
/* Find the RA-learned default router route (RTF_ADDRCONF|RTF_DEFAULT)
 * via gateway @addr on @dev.  Returns it with a reference held, or NULL.
 */
struct fib6_info *rt6_get_dflt_router(struct net *net,
				     const struct in6_addr *addr,
				     struct net_device *dev)
{
	/* l3mdev (VRF) slaves use their master's table, others RT6_TABLE_DFLT */
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
	struct fib6_info *rt;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	/* the macro iterates 'rt'; 'rt' is NULL if the list is exhausted */
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		if (dev == rt->fib6_nh.nh_dev &&
		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
			break;
	}
	/* don't return an entry that is already on its way out */
	if (rt && !fib6_info_hold_safe(rt))
		rt = NULL;
	rcu_read_unlock();
	return rt;
}
3553
/* Install an RA-learned default route via @gwaddr on @dev with router
 * preference @pref, then look it up again and return the table's entry
 * (with a reference held), or NULL.
 */
struct fib6_info *rt6_add_dflt_router(struct net *net,
				     const struct in6_addr *gwaddr,
				     struct net_device *dev,
				     unsigned int pref)
{
	struct fib6_config cfg = {
		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
		.fc_metric	= IP6_RT_PRIO_USER,
		.fc_ifindex	= dev->ifindex,
		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
		.fc_protocol = RTPROT_RA,
		.fc_type = RTN_UNICAST,
		.fc_nlinfo.portid = 0,
		.fc_nlinfo.nlh = NULL,
		.fc_nlinfo.nl_net = net,
	};

	cfg.fc_gateway = *gwaddr;

	if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
		struct fib6_table *table;

		/* flag the table so rt6_purge_dflt_routers() will scan it.
		 * NOTE(review): this lookup uses dev_net(dev) while the rest
		 * of the function uses @net -- presumably identical; confirm
		 * against callers.
		 */
		table = fib6_get_table(dev_net(dev), cfg.fc_table);
		if (table)
			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
	}

	return rt6_get_dflt_router(net, gwaddr, dev);
}
3584
/* Delete every RA-learned router route (RTF_DEFAULT | RTF_ADDRCONF) in
 * @table, except those on interfaces with accept_ra == 2, then clear the
 * table's "has default router" flag.
 */
static void __rt6_purge_dflt_routers(struct net *net,
				     struct fib6_table *table)
{
	struct fib6_info *rt;

restart:
	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;

		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!idev || idev->cnf.accept_ra != 2) &&
		    fib6_info_hold_safe(rt)) {
			/* can't delete while inside the RCU walk: drop the
			 * lock, delete, and rescan from the top since the
			 * walk position is now stale
			 */
			rcu_read_unlock();
			ip6_del_rt(net, rt);
			goto restart;
		}
	}
	rcu_read_unlock();

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}
3608
3609 void rt6_purge_dflt_routers(struct net *net)
3610 {
3611         struct fib6_table *table;
3612         struct hlist_head *head;
3613         unsigned int h;
3614
3615         rcu_read_lock();
3616
3617         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3618                 head = &net->ipv6.fib_table_hash[h];
3619                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3620                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3621                                 __rt6_purge_dflt_routers(net, table);
3622                 }
3623         }
3624
3625         rcu_read_unlock();
3626 }
3627
3628 static void rtmsg_to_fib6_config(struct net *net,
3629                                  struct in6_rtmsg *rtmsg,
3630                                  struct fib6_config *cfg)
3631 {
3632         *cfg = (struct fib6_config){
3633                 .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3634                          : RT6_TABLE_MAIN,
3635                 .fc_ifindex = rtmsg->rtmsg_ifindex,
3636                 .fc_metric = rtmsg->rtmsg_metric ? : IP6_RT_PRIO_USER,
3637                 .fc_expires = rtmsg->rtmsg_info,
3638                 .fc_dst_len = rtmsg->rtmsg_dst_len,
3639                 .fc_src_len = rtmsg->rtmsg_src_len,
3640                 .fc_flags = rtmsg->rtmsg_flags,
3641                 .fc_type = rtmsg->rtmsg_type,
3642
3643                 .fc_nlinfo.nl_net = net,
3644
3645                 .fc_dst = rtmsg->rtmsg_dst,
3646                 .fc_src = rtmsg->rtmsg_src,
3647                 .fc_gateway = rtmsg->rtmsg_gateway,
3648         };
3649 }
3650
3651 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3652 {
3653         struct fib6_config cfg;
3654         struct in6_rtmsg rtmsg;
3655         int err;
3656
3657         switch (cmd) {
3658         case SIOCADDRT:         /* Add a route */
3659         case SIOCDELRT:         /* Delete a route */
3660                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3661                         return -EPERM;
3662                 err = copy_from_user(&rtmsg, arg,
3663                                      sizeof(struct in6_rtmsg));
3664                 if (err)
3665                         return -EFAULT;
3666
3667                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3668
3669                 rtnl_lock();
3670                 switch (cmd) {
3671                 case SIOCADDRT:
3672                         err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3673                         break;
3674                 case SIOCDELRT:
3675                         err = ip6_route_del(&cfg, NULL);
3676                         break;
3677                 default:
3678                         err = -EINVAL;
3679                 }
3680                 rtnl_unlock();
3681
3682                 return err;
3683         }
3684
3685         return -EINVAL;
3686 }
3687
3688 /*
3689  *      Drop the packet on the floor
3690  */
3691
/* Common "no route" drop path: bump the appropriate SNMP counter, send
 * an ICMPv6 Destination Unreachable with @code, and free the skb.
 * @ipstats_mib_noroutes selects the input vs. output no-route counter.
 * Always returns 0.
 */
static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
	int type;
	struct dst_entry *dst = skb_dst(skb);
	switch (ipstats_mib_noroutes) {
	case IPSTATS_MIB_INNOROUTES:
		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
		if (type == IPV6_ADDR_ANY) {
			/* unspecified destination counts as an address
			 * error, not as "no route"
			 */
			IP6_INC_STATS(dev_net(dst->dev),
				      __in6_dev_get_safely(skb->dev),
				      IPSTATS_MIB_INADDRERRORS);
			break;
		}
		/* FALLTHROUGH */
	case IPSTATS_MIB_OUTNOROUTES:
		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
			      ipstats_mib_noroutes);
		break;
	}
	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
	kfree_skb(skb);
	return 0;
}
3715
/* Input handler for blackhole-type dsts: drop with ICMPv6 "no route". */
static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
3720
/* Output handler for blackhole-type dsts: point skb at the route's
 * device, then drop with ICMPv6 "no route".
 */
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}
3726
/* Input handler for prohibit-type dsts: drop with "administratively
 * prohibited".
 */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}
3731
/* Output handler for prohibit-type dsts: point skb at the route's
 * device, then drop with "administratively prohibited".
 */
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}
3737
3738 /*
3739  *      Allocate a dst for local (unicast / anycast) address.
3740  */
3741
3742 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3743                                      struct inet6_dev *idev,
3744                                      const struct in6_addr *addr,
3745                                      bool anycast, gfp_t gfp_flags)
3746 {
3747         struct fib6_config cfg = {
3748                 .fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL,
3749                 .fc_ifindex = idev->dev->ifindex,
3750                 .fc_flags = RTF_UP | RTF_ADDRCONF | RTF_NONEXTHOP,
3751                 .fc_dst = *addr,
3752                 .fc_dst_len = 128,
3753                 .fc_protocol = RTPROT_KERNEL,
3754                 .fc_nlinfo.nl_net = net,
3755                 .fc_ignore_dev_down = true,
3756         };
3757
3758         if (anycast) {
3759                 cfg.fc_type = RTN_ANYCAST;
3760                 cfg.fc_flags |= RTF_ANYCAST;
3761         } else {
3762                 cfg.fc_type = RTN_LOCAL;
3763                 cfg.fc_flags |= RTF_LOCAL;
3764         }
3765
3766         return ip6_route_info_create(&cfg, gfp_flags, NULL);
3767 }
3768
/* remove deleted ip from prefsrc entries */
struct arg_dev_net_ip {
	struct net_device *dev;	/* restrict to this device; NULL matches any */
	struct net *net;	/* namespace whose tables are being cleaned */
	struct in6_addr *addr;	/* preferred-source address being removed */
};
3775
3776 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3777 {
3778         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3779         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3780         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3781
3782         if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3783             rt != net->ipv6.fib6_null_entry &&
3784             ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3785                 spin_lock_bh(&rt6_exception_lock);
3786                 /* remove prefsrc entry */
3787                 rt->fib6_prefsrc.plen = 0;
3788                 spin_unlock_bh(&rt6_exception_lock);
3789         }
3790         return 0;
3791 }
3792
3793 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3794 {
3795         struct net *net = dev_net(ifp->idev->dev);
3796         struct arg_dev_net_ip adni = {
3797                 .dev = ifp->idev->dev,
3798                 .net = net,
3799                 .addr = &ifp->addr,
3800         };
3801         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3802 }
3803
#define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT)

/* Remove routers and update dst entries when gateway turn into host. */
/* fib6_clean_all callback: returning -1 asks the fib6 walker to delete
 * the RA default-router route whose gateway is now @gateway (a host);
 * matching cached exception routes are scrubbed either way.
 */
static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
{
	struct in6_addr *gateway = (struct in6_addr *)arg;

	if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
	    rt->fib6_nh.fib_nh_has_gw &&
	    ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
		return -1;
	}

	/* Further clean up cached routes in exception table.
	 * This is needed because cached route may have a different
	 * gateway than its 'parent' in the case of an ip redirect.
	 */
	rt6_exceptions_clean_tohost(rt, gateway);

	return 0;
}
3825
/* Walk all tables, dropping RA router routes whose gateway @gateway has
 * turned into a host (see fib6_clean_tohost()).
 */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}
3830
/* Argument for fib6_ifup()/fib6_ifdown(): the device plus either the
 * nexthop flags to clear (sync up) or the netdev event (sync down).
 */
struct arg_netdev_event {
	const struct net_device *dev;
	union {
		unsigned int nh_flags;	/* used by rt6_sync_up() */
		unsigned long event;	/* used by rt6_sync_down_dev() */
	};
};
3838
/* Return the first route in @rt's fib6 node leaf list with the same
 * metric and ECMP eligibility (i.e. the head of @rt's ECMP group), or
 * NULL.  Caller must hold the table write lock (see lockdep_is_held).
 */
static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
{
	struct fib6_info *iter;
	struct fib6_node *fn;

	fn = rcu_dereference_protected(rt->fib6_node,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
	iter = rcu_dereference_protected(fn->leaf,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
	while (iter) {
		if (iter->fib6_metric == rt->fib6_metric &&
		    rt6_qualify_for_ecmp(iter))
			return iter;
		iter = rcu_dereference_protected(iter->fib6_next,
				lockdep_is_held(&rt->fib6_table->tb6_lock));
	}

	return NULL;
}
3858
3859 static bool rt6_is_dead(const struct fib6_info *rt)
3860 {
3861         if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3862             (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3863              ip6_ignore_linkdown(rt->fib6_nh.nh_dev)))
3864                 return true;
3865
3866         return false;
3867 }
3868
3869 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3870 {
3871         struct fib6_info *iter;
3872         int total = 0;
3873
3874         if (!rt6_is_dead(rt))
3875                 total += rt->fib6_nh.nh_weight;
3876
3877         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3878                 if (!rt6_is_dead(iter))
3879                         total += iter->fib6_nh.nh_weight;
3880         }
3881
3882         return total;
3883 }
3884
/* Assign the hash-threshold upper bound for one nexthop.  @weight is the
 * running sum of live nexthop weights so far; the bound is the cumulative
 * fraction of @total scaled into [0, 2^31).  A dead nexthop keeps -1
 * (presumably never matched by the selection code -- confirm in the
 * multipath lookup path).
 */
static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
{
	int upper_bound = -1;

	if (!rt6_is_dead(rt)) {
		*weight += rt->fib6_nh.nh_weight;
		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
						    total) - 1;
	}
	atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
}
3896
3897 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3898 {
3899         struct fib6_info *iter;
3900         int weight = 0;
3901
3902         rt6_upper_bound_set(rt, &weight, total);
3903
3904         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3905                 rt6_upper_bound_set(iter, &weight, total);
3906 }
3907
/* Recompute hash-threshold upper bounds for the ECMP group containing
 * @rt.  No-op for non-multipath routes or groups marked for flushing.
 */
void rt6_multipath_rebalance(struct fib6_info *rt)
{
	struct fib6_info *first;
	int total;

	/* In case the entire multipath route was marked for flushing,
	 * then there is no need to rebalance upon the removal of every
	 * sibling route.
	 */
	if (!rt->fib6_nsiblings || rt->should_flush)
		return;

	/* During lookup routes are evaluated in order, so we need to
	 * make sure upper bounds are assigned from the first sibling
	 * onwards.
	 */
	first = rt6_multipath_first_sibling(rt);
	if (WARN_ON_ONCE(!first))
		return;

	total = rt6_multipath_total_weight(first);
	rt6_multipath_upper_bound_set(first, total);
}
3931
3932 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3933 {
3934         const struct arg_netdev_event *arg = p_arg;
3935         struct net *net = dev_net(arg->dev);
3936
3937         if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
3938                 rt->fib6_nh.nh_flags &= ~arg->nh_flags;
3939                 fib6_update_sernum_upto_root(net, rt);
3940                 rt6_multipath_rebalance(rt);
3941         }
3942
3943         return 0;
3944 }
3945
/* Clear @nh_flags on every route using @dev (via fib6_ifup()).  When
 * reviving DEAD nexthops on a device whose carrier is up, clear the
 * LINKDOWN flag as well.
 */
void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.nh_flags = nh_flags,
		},
	};

	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
		arg.nh_flags |= RTNH_F_LINKDOWN;

	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
}
3960
3961 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3962                                    const struct net_device *dev)
3963 {
3964         struct fib6_info *iter;
3965
3966         if (rt->fib6_nh.nh_dev == dev)
3967                 return true;
3968         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3969                 if (iter->fib6_nh.nh_dev == dev)
3970                         return true;
3971
3972         return false;
3973 }
3974
3975 static void rt6_multipath_flush(struct fib6_info *rt)
3976 {
3977         struct fib6_info *iter;
3978
3979         rt->should_flush = 1;
3980         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3981                 iter->should_flush = 1;
3982 }
3983
3984 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
3985                                              const struct net_device *down_dev)
3986 {
3987         struct fib6_info *iter;
3988         unsigned int dead = 0;
3989
3990         if (rt->fib6_nh.nh_dev == down_dev ||
3991             rt->fib6_nh.nh_flags & RTNH_F_DEAD)
3992                 dead++;
3993         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3994                 if (iter->fib6_nh.nh_dev == down_dev ||
3995                     iter->fib6_nh.nh_flags & RTNH_F_DEAD)
3996                         dead++;
3997
3998         return dead;
3999 }
4000
4001 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
4002                                        const struct net_device *dev,
4003                                        unsigned int nh_flags)
4004 {
4005         struct fib6_info *iter;
4006
4007         if (rt->fib6_nh.nh_dev == dev)
4008                 rt->fib6_nh.nh_flags |= nh_flags;
4009         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4010                 if (iter->fib6_nh.nh_dev == dev)
4011                         iter->fib6_nh.nh_flags |= nh_flags;
4012 }
4013
/* called with write lock held for table with rt */
/* fib6_clean_all callback for rt6_sync_down_dev(): react to a device
 * going down or unregistering.  Returning -1 asks the fib6 walker to
 * delete the route, 0 keeps it; -2 appears to be a distinct "handled,
 * keep" code for the multipath case -- confirm against fib6_clean_node.
 */
static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
{
	const struct arg_netdev_event *arg = p_arg;
	const struct net_device *dev = arg->dev;
	struct net *net = dev_net(dev);

	/* never touch the reference null entry */
	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	switch (arg->event) {
	case NETDEV_UNREGISTER:
		/* device going away entirely: delete its routes */
		return rt->fib6_nh.nh_dev == dev ? -1 : 0;
	case NETDEV_DOWN:
		if (rt->should_flush)
			return -1;
		if (!rt->fib6_nsiblings)
			/* non-multipath: delete iff it uses this device */
			return rt->fib6_nh.nh_dev == dev ? -1 : 0;
		if (rt6_multipath_uses_dev(rt, dev)) {
			unsigned int count;

			count = rt6_multipath_dead_count(rt, dev);
			if (rt->fib6_nsiblings + 1 == count) {
				/* every nexthop is dead: flush the group */
				rt6_multipath_flush(rt);
				return -1;
			}
			/* otherwise just mark the affected nexthops */
			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
						   RTNH_F_LINKDOWN);
			fib6_update_sernum(net, rt);
			rt6_multipath_rebalance(rt);
		}
		return -2;
	case NETDEV_CHANGE:
		/* carrier change: flag the nexthop as link-down, except for
		 * local/anycast routes
		 */
		if (rt->fib6_nh.nh_dev != dev ||
		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
			break;
		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
		rt6_multipath_rebalance(rt);
		break;
	}

	return 0;
}
4057
4058 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4059 {
4060         struct arg_netdev_event arg = {
4061                 .dev = dev,
4062                 {
4063                         .event = event,
4064                 },
4065         };
4066         struct net *net = dev_net(dev);
4067
4068         if (net->ipv6.sysctl.skip_notify_on_dev_down)
4069                 fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
4070         else
4071                 fib6_clean_all(net, fib6_ifdown, &arg);
4072 }
4073
/* Full IPv6 teardown for @dev: sync routes down for @event, flush
 * uncached dst entries referencing the device, and drop its ND
 * neighbour entries.
 */
void rt6_disable_ip(struct net_device *dev, unsigned long event)
{
	rt6_sync_down_dev(dev, event);
	rt6_uncached_list_flush_dev(dev_net(dev), dev);
	neigh_ifdown(&nd_tbl, dev);
}
4080
/* Argument for rt6_mtu_change_route(): the device whose MTU changed
 * and its new MTU.
 */
struct rt6_mtu_change_arg {
	struct net_device *dev;
	unsigned int mtu;
};
4085
/* fib6_clean_all callback for rt6_mtu_change(): propagate a device MTU
 * change into route MTU metrics and cached-exception PMTU state.
 * Always returns 0 (never deletes routes).
 */
static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e. jumbo frame)
	 */
	if (rt->fib6_nh.nh_dev == arg->dev &&
	    !fib6_metric_locked(rt, RTAX_MTU)) {
		u32 mtu = rt->fib6_pmtu;

		/* adopt the new device MTU if the route's MTU would now
		 * exceed it, or if it was tracking the old device MTU
		 */
		if (mtu >= arg->mtu ||
		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
			fib6_metric_set(rt, RTAX_MTU, arg->mtu);

		/* also refresh PMTU on this route's cached exceptions */
		spin_lock_bh(&rt6_exception_lock);
		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}
4120
4121 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4122 {
4123         struct rt6_mtu_change_arg arg = {
4124                 .dev = dev,
4125                 .mtu = mtu,
4126         };
4127
4128         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4129 }
4130
/* Netlink attribute validation policy for RTM_NEWROUTE/RTM_DELROUTE/
 * RTM_GETROUTE requests, consumed by nlmsg_parse() in
 * rtm_to_fib6_config().
 */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]		= { .len = sizeof(struct in6_addr) },
	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
	[RTA_OIF]		= { .type = NLA_U32 },
	[RTA_IIF]		= { .type = NLA_U32 },
	[RTA_PRIORITY]		= { .type = NLA_U32 },
	[RTA_METRICS]		= { .type = NLA_NESTED },
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]		= { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
	[RTA_ENCAP]		= { .type = NLA_NESTED },
	[RTA_EXPIRES]		= { .type = NLA_U32 },
	[RTA_UID]		= { .type = NLA_U32 },
	[RTA_MARK]		= { .type = NLA_U32 },
	[RTA_TABLE]		= { .type = NLA_U32 },
	[RTA_IP_PROTO]		= { .type = NLA_U8 },
	[RTA_SPORT]		= { .type = NLA_U16 },
	[RTA_DPORT]		= { .type = NLA_U16 },
};
4150
4151 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4152                               struct fib6_config *cfg,
4153                               struct netlink_ext_ack *extack)
4154 {
4155         struct rtmsg *rtm;
4156         struct nlattr *tb[RTA_MAX+1];
4157         unsigned int pref;
4158         int err;
4159
4160         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4161                           extack);
4162         if (err < 0)
4163                 goto errout;
4164
4165         err = -EINVAL;
4166         rtm = nlmsg_data(nlh);
4167
4168         *cfg = (struct fib6_config){
4169                 .fc_table = rtm->rtm_table,
4170                 .fc_dst_len = rtm->rtm_dst_len,
4171                 .fc_src_len = rtm->rtm_src_len,
4172                 .fc_flags = RTF_UP,
4173                 .fc_protocol = rtm->rtm_protocol,
4174                 .fc_type = rtm->rtm_type,
4175
4176                 .fc_nlinfo.portid = NETLINK_CB(skb).portid,
4177                 .fc_nlinfo.nlh = nlh,
4178                 .fc_nlinfo.nl_net = sock_net(skb->sk),
4179         };
4180
4181         if (rtm->rtm_type == RTN_UNREACHABLE ||
4182             rtm->rtm_type == RTN_BLACKHOLE ||
4183             rtm->rtm_type == RTN_PROHIBIT ||
4184             rtm->rtm_type == RTN_THROW)
4185                 cfg->fc_flags |= RTF_REJECT;
4186
4187         if (rtm->rtm_type == RTN_LOCAL)
4188                 cfg->fc_flags |= RTF_LOCAL;
4189
4190         if (rtm->rtm_flags & RTM_F_CLONED)
4191                 cfg->fc_flags |= RTF_CACHE;
4192
4193         cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4194
4195         if (tb[RTA_GATEWAY]) {
4196                 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4197                 cfg->fc_flags |= RTF_GATEWAY;
4198         }
4199         if (tb[RTA_VIA]) {
4200                 NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
4201                 goto errout;
4202         }
4203
4204         if (tb[RTA_DST]) {
4205                 int plen = (rtm->rtm_dst_len + 7) >> 3;
4206
4207                 if (nla_len(tb[RTA_DST]) < plen)
4208                         goto errout;
4209
4210                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4211         }
4212
4213         if (tb[RTA_SRC]) {
4214                 int plen = (rtm->rtm_src_len + 7) >> 3;
4215
4216                 if (nla_len(tb[RTA_SRC]) < plen)
4217                         goto errout;
4218
4219                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4220         }
4221
4222         if (tb[RTA_PREFSRC])
4223                 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4224
4225         if (tb[RTA_OIF])
4226                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4227
4228         if (tb[RTA_PRIORITY])
4229                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4230
4231         if (tb[RTA_METRICS]) {
4232                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4233                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4234         }
4235
4236         if (tb[RTA_TABLE])
4237                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4238
4239         if (tb[RTA_MULTIPATH]) {
4240                 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4241                 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4242
4243                 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4244                                                      cfg->fc_mp_len, extack);
4245                 if (err < 0)
4246                         goto errout;
4247         }
4248
4249         if (tb[RTA_PREF]) {
4250                 pref = nla_get_u8(tb[RTA_PREF]);
4251                 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4252                     pref != ICMPV6_ROUTER_PREF_HIGH)
4253                         pref = ICMPV6_ROUTER_PREF_MEDIUM;
4254                 cfg->fc_flags |= RTF_PREF(pref);
4255         }
4256
4257         if (tb[RTA_ENCAP])
4258                 cfg->fc_encap = tb[RTA_ENCAP];
4259
4260         if (tb[RTA_ENCAP_TYPE]) {
4261                 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4262
4263                 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4264                 if (err < 0)
4265                         goto errout;
4266         }
4267
4268         if (tb[RTA_EXPIRES]) {
4269                 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4270
4271                 if (addrconf_finite_timeout(timeout)) {
4272                         cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4273                         cfg->fc_flags |= RTF_EXPIRES;
4274                 }
4275         }
4276
4277         err = 0;
4278 errout:
4279         return err;
4280 }
4281
/* Node in the temporary list built by ip6_route_multipath_add(): one
 * entry per nexthop parsed out of the RTA_MULTIPATH attribute.
 */
struct rt6_nh {
	struct fib6_info *fib6_info;	/* route for this nexthop; reset to NULL once inserted or freed */
	struct fib6_config r_cfg;	/* per-nexthop config copy, reused for rollback deletes */
	struct list_head next;		/* link in the caller's rt6_nh_list */
};
4287
4288 static int ip6_route_info_append(struct net *net,
4289                                  struct list_head *rt6_nh_list,
4290                                  struct fib6_info *rt,
4291                                  struct fib6_config *r_cfg)
4292 {
4293         struct rt6_nh *nh;
4294         int err = -EEXIST;
4295
4296         list_for_each_entry(nh, rt6_nh_list, next) {
4297                 /* check if fib6_info already exists */
4298                 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4299                         return err;
4300         }
4301
4302         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4303         if (!nh)
4304                 return -ENOMEM;
4305         nh->fib6_info = rt;
4306         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4307         list_add_tail(&nh->next, rt6_nh_list);
4308
4309         return 0;
4310 }
4311
4312 static void ip6_route_mpath_notify(struct fib6_info *rt,
4313                                    struct fib6_info *rt_last,
4314                                    struct nl_info *info,
4315                                    __u16 nlflags)
4316 {
4317         /* if this is an APPEND route, then rt points to the first route
4318          * inserted and rt_last points to last route inserted. Userspace
4319          * wants a consistent dump of the route which starts at the first
4320          * nexthop. Since sibling routes are always added at the end of
4321          * the list, find the first sibling of the last route appended
4322          */
4323         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4324                 rt = list_first_entry(&rt_last->fib6_siblings,
4325                                       struct fib6_info,
4326                                       fib6_siblings);
4327         }
4328
4329         if (rt)
4330                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4331 }
4332
/* Install one fib6_info per nexthop of an RTA_MULTIPATH request.
 *
 * All nexthops are first created and collected on rt6_nh_list, then
 * inserted one by one with notifications suppressed.  On success a single
 * notification covering the whole route is sent; on partial failure the
 * routes already installed are deleted again (after notifying what was
 * added) so userspace stays coherent with the delete notifications.
 */
static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct fib6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	__u16 nlflags;
	int remaining;
	int attrlen;
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);

	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * fib6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		/* start from the request-wide config, then apply the
		 * per-nexthop ifindex/gateway/encap overrides
		 */
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}
		if (!rt6_qualify_for_ecmp(rt)) {
			err = -EINVAL;
			NL_SET_ERR_MSG(extack,
				       "Device only routes can not be added for IPv6 using the multipath API.");
			fib6_info_release(rt);
			goto cleanup;
		}

		/* rtnh_hops carries the zero-based weight on the wire */
		rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;

		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
					    rt, &r_cfg);
		if (err) {
			fib6_info_release(rt);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* for add and replace send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		/* insertion takes its own reference; drop the list's one */
		err = __ip6_ins_rt(nh->fib6_info, info, extack);
		fib6_info_release(nh->fib6_info);

		if (!err) {
			/* save reference to last route successfully inserted */
			rt_last = nh->fib6_info;

			/* save reference to first route for notification */
			if (!rt_notif)
				rt_notif = nh->fib6_info;
		}

		/* nh->fib6_info is used or freed at this point, reset to NULL*/
		nh->fib6_info = NULL;
		if (err) {
			if (replace && nhn)
				NL_SET_ERR_MSG_MOD(extack,
						   "multipath route replace failed (check consistency of installed routes)");
			err_nh = nh;
			goto add_errout;
		}

		/* Because each route is added like a single route we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		nhn++;
	}

	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	goto cleanup;

add_errout:
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg, extack);
	}

cleanup:
	/* release any routes never handed to __ip6_ins_rt() */
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->fib6_info)
			fib6_info_release(nh->fib6_info);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}
4480
4481 static int ip6_route_multipath_del(struct fib6_config *cfg,
4482                                    struct netlink_ext_ack *extack)
4483 {
4484         struct fib6_config r_cfg;
4485         struct rtnexthop *rtnh;
4486         int remaining;
4487         int attrlen;
4488         int err = 1, last_err = 0;
4489
4490         remaining = cfg->fc_mp_len;
4491         rtnh = (struct rtnexthop *)cfg->fc_mp;
4492
4493         /* Parse a Multipath Entry */
4494         while (rtnh_ok(rtnh, remaining)) {
4495                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4496                 if (rtnh->rtnh_ifindex)
4497                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4498
4499                 attrlen = rtnh_attrlen(rtnh);
4500                 if (attrlen > 0) {
4501                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4502
4503                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4504                         if (nla) {
4505                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4506                                 r_cfg.fc_flags |= RTF_GATEWAY;
4507                         }
4508                 }
4509                 err = ip6_route_del(&r_cfg, extack);
4510                 if (err)
4511                         last_err = err;
4512
4513                 rtnh = rtnh_next(rtnh, &remaining);
4514         }
4515
4516         return last_err;
4517 }
4518
4519 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4520                               struct netlink_ext_ack *extack)
4521 {
4522         struct fib6_config cfg;
4523         int err;
4524
4525         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4526         if (err < 0)
4527                 return err;
4528
4529         if (cfg.fc_mp)
4530                 return ip6_route_multipath_del(&cfg, extack);
4531         else {
4532                 cfg.fc_delete_all_nh = 1;
4533                 return ip6_route_del(&cfg, extack);
4534         }
4535 }
4536
4537 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4538                               struct netlink_ext_ack *extack)
4539 {
4540         struct fib6_config cfg;
4541         int err;
4542
4543         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4544         if (err < 0)
4545                 return err;
4546
4547         if (cfg.fc_metric == 0)
4548                 cfg.fc_metric = IP6_RT_PRIO_USER;
4549
4550         if (cfg.fc_mp)
4551                 return ip6_route_multipath_add(&cfg, extack);
4552         else
4553                 return ip6_route_add(&cfg, GFP_KERNEL, extack);
4554 }
4555
4556 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4557 {
4558         int nexthop_len = 0;
4559
4560         if (rt->fib6_nsiblings) {
4561                 nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
4562                             + NLA_ALIGN(sizeof(struct rtnexthop))
4563                             + nla_total_size(16) /* RTA_GATEWAY */
4564                             + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
4565
4566                 nexthop_len *= rt->fib6_nsiblings;
4567         }
4568
4569         return NLMSG_ALIGN(sizeof(struct rtmsg))
4570                + nla_total_size(16) /* RTA_SRC */
4571                + nla_total_size(16) /* RTA_DST */
4572                + nla_total_size(16) /* RTA_GATEWAY */
4573                + nla_total_size(16) /* RTA_PREFSRC */
4574                + nla_total_size(4) /* RTA_TABLE */
4575                + nla_total_size(4) /* RTA_IIF */
4576                + nla_total_size(4) /* RTA_OIF */
4577                + nla_total_size(4) /* RTA_PRIORITY */
4578                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4579                + nla_total_size(sizeof(struct rta_cacheinfo))
4580                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4581                + nla_total_size(1) /* RTA_PREF */
4582                + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
4583                + nexthop_len;
4584 }
4585
/* Emit the nexthop attributes (gateway, oif, lwtunnel encap) of @rt into
 * @skb and accumulate the RTNH_F_* state into *@flags.  @skip_oif is set
 * for multipath encoding, where the ifindex lives in the rtnexthop struct
 * rather than an RTA_OIF attribute.
 * Returns 0 on success or -EMSGSIZE if the skb ran out of room.
 */
static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
			    unsigned int *flags, bool skip_oif)
{
	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		*flags |= RTNH_F_DEAD;

	if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
		*flags |= RTNH_F_LINKDOWN;

		/* a link-down nexthop is also reported dead when the device
		 * is configured to ignore routes on linkdown
		 */
		rcu_read_lock();
		if (ip6_ignore_linkdown(rt->fib6_nh.nh_dev))
			*flags |= RTNH_F_DEAD;
		rcu_read_unlock();
	}

	if (rt->fib6_nh.fib_nh_has_gw) {
		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
			goto nla_put_failure;
	}

	*flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
	if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
		*flags |= RTNH_F_OFFLOAD;

	/* not needed for multipath encoding b/c it has a rtnexthop struct */
	if (!skip_oif && rt->fib6_nh.nh_dev &&
	    nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
		goto nla_put_failure;

	if (rt->fib6_nh.nh_lwtstate &&
	    lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
4624
4625 /* add multipath next hop */
4626 static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
4627 {
4628         const struct net_device *dev = rt->fib6_nh.nh_dev;
4629         struct rtnexthop *rtnh;
4630         unsigned int flags = 0;
4631
4632         rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4633         if (!rtnh)
4634                 goto nla_put_failure;
4635
4636         rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
4637         rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
4638
4639         if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4640                 goto nla_put_failure;
4641
4642         rtnh->rtnh_flags = flags;
4643
4644         /* length of rtnetlink header + attributes */
4645         rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4646
4647         return 0;
4648
4649 nla_put_failure:
4650         return -EMSGSIZE;
4651 }
4652
/* Build one route message of @type for @rt into @skb.
 *
 * @dst, when non-NULL, is the dst_entry of a cached/cloned route (rt6)
 * whose keys, flags, metrics and expiry override those of the fib entry.
 * @dest/@src, when given, are the exact lookup addresses and force /128
 * prefix lengths in the dump.  @iif is reported as RTA_IIF for
 * input-path lookups.
 * Returns 0 on success or -EMSGSIZE (message cancelled) on overflow.
 */
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	struct rt6_info *rt6 = (struct rt6_info *)dst;
	struct rt6key *rt6_dst, *rt6_src;
	u32 *pmetrics, table, rt6_flags;
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;
	long expires = 0;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	/* a cached clone carries its own keys and flags */
	if (rt6) {
		rt6_dst = &rt6->rt6i_dst;
		rt6_src = &rt6->rt6i_src;
		rt6_flags = rt6->rt6i_flags;
	} else {
		rt6_dst = &rt->fib6_dst;
		rt6_src = &rt->fib6_src;
		rt6_flags = rt->fib6_flags;
	}

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt6_dst->plen;
	rtm->rtm_src_len = rt6_src->plen;
	rtm->rtm_tos = 0;
	if (rt->fib6_table)
		table = rt->fib6_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	/* rtm_table is only 8 bits wide; the full id goes in RTA_TABLE */
	rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;

	rtm->rtm_type = rt->fib6_type;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->fib6_protocol;

	if (rt6_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	if (dest) {
		if (nla_put_in6_addr(skb, RTA_DST, dest))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		/* multicast destinations are resolved by ip6mr, which may
		 * complete the message itself (err == 0)
		 */
		if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dest) {
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->fib6_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->fib6_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	/* cached routes report dst metrics, fib entries their own */
	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt6) {
		if (rt6_flags & RTF_GATEWAY &&
		    nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
			goto nla_put_failure;

		if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
			goto nla_put_failure;
	} else if (rt->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (rt6_add_nexthop(skb, rt) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings, fib6_siblings) {
			if (rt6_add_nexthop(skb, sibling) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
			goto nla_put_failure;
	}

	if (rt6_flags & RTF_EXPIRES) {
		expires = dst ? dst->expires : rt->expires;
		expires -= jiffies;
	}

	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
		goto nla_put_failure;


	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
4803
4804 static bool fib6_info_uses_dev(const struct fib6_info *f6i,
4805                                const struct net_device *dev)
4806 {
4807         if (f6i->fib6_nh.nh_dev == dev)
4808                 return true;
4809
4810         if (f6i->fib6_nsiblings) {
4811                 struct fib6_info *sibling, *next_sibling;
4812
4813                 list_for_each_entry_safe(sibling, next_sibling,
4814                                          &f6i->fib6_siblings, fib6_siblings) {
4815                         if (sibling->fib6_nh.nh_dev == dev)
4816                                 return true;
4817                 }
4818         }
4819
4820         return false;
4821 }
4822
4823 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4824 {
4825         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4826         struct fib_dump_filter *filter = &arg->filter;
4827         unsigned int flags = NLM_F_MULTI;
4828         struct net *net = arg->net;
4829
4830         if (rt == net->ipv6.fib6_null_entry)
4831                 return 0;
4832
4833         if ((filter->flags & RTM_F_PREFIX) &&
4834             !(rt->fib6_flags & RTF_PREFIX_RT)) {
4835                 /* success since this is not a prefix route */
4836                 return 1;
4837         }
4838         if (filter->filter_set) {
4839                 if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
4840                     (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
4841                     (filter->protocol && rt->fib6_protocol != filter->protocol)) {
4842                         return 1;
4843                 }
4844                 flags |= NLM_F_DUMP_FILTERED;
4845         }
4846
4847         return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4848                              RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4849                              arg->cb->nlh->nlmsg_seq, flags);
4850 }
4851
/* Validate an RTM_GETROUTE request and parse its attributes into @tb.
 *
 * Sockets that have not opted into strict checking get a plain
 * best-effort parse.  Under strict checking, header fields that have no
 * meaning for a lookup must be zero and only the attributes consumed by
 * inet6_rtm_getroute() are accepted.
 * Returns 0 or a negative errno with @extack set.
 */
static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
					const struct nlmsghdr *nlh,
					struct nlattr **tb,
					struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	int i, err;

	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
		NL_SET_ERR_MSG_MOD(extack,
				   "Invalid header for get route request");
		return -EINVAL;
	}

	/* legacy sockets: permissive attribute parsing only */
	if (!netlink_strict_get_check(skb))
		return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX,
				   rtm_ipv6_policy, extack);

	rtm = nlmsg_data(nlh);
	if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
	    rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
	    rtm->rtm_type) {
		NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
		return -EINVAL;
	}
	if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
		NL_SET_ERR_MSG_MOD(extack,
				   "Invalid flags for get route request");
		return -EINVAL;
	}

	err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
				 rtm_ipv6_policy, extack);
	if (err)
		return err;

	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
		NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
		return -EINVAL;
	}

	/* reject any attribute the lookup does not consume */
	for (i = 0; i <= RTA_MAX; i++) {
		if (!tb[i])
			continue;

		switch (i) {
		case RTA_SRC:
		case RTA_DST:
		case RTA_IIF:
		case RTA_OIF:
		case RTA_MARK:
		case RTA_UID:
		case RTA_SPORT:
		case RTA_DPORT:
		case RTA_IP_PROTO:
			break;
		default:
			NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
			return -EINVAL;
		}
	}

	return 0;
}
4918
/* RTM_GETROUTE handler: build a flow from the request attributes, do an
 * input- or output-path route lookup, and unicast the resulting route
 * back to the requester.  With RTM_F_FIB_MATCH set, the matching fib
 * entry is dumped instead of the lookup result.
 */
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct fib6_info *from;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6 = {};
	bool fibmatch;

	err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (tb[RTA_SPORT])
		fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);

	if (tb[RTA_DPORT])
		fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);

	if (tb[RTA_IP_PROTO]) {
		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
						  &fl6.flowi6_proto, AF_INET6,
						  extack);
		if (err)
			goto errout;
	}

	/* RTA_IIF selects the input path, otherwise do an output lookup */
	if (iif) {
		struct net_device *dev;
		int flags = 0;

		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			rcu_read_unlock();
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);

		rcu_read_unlock();
	} else {
		fl6.flowi6_oif = oif;

		dst = ip6_route_output(net, NULL, &fl6);
	}


	rt = container_of(dst, struct rt6_info, dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	/* the lookup's dst reference is handed to the skb and released
	 * together with it
	 */
	skb_dst_set(skb, &rt->dst);

	/* rt->from is rcu-protected; hold the read lock across the dump */
	rcu_read_lock();
	from = rcu_dereference(rt->from);

	if (fibmatch)
		err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	else
		err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
				    &fl6.saddr, iif, RTM_NEWROUTE,
				    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
				    0);
	rcu_read_unlock();

	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}
5058
/* Broadcast a netlink notification (@event, e.g. RTM_NEWROUTE) for @rt to
 * the RTNLGRP_IPV6_ROUTE multicast group.  @nlm_flags is OR-ed into the
 * message's nlmsg flags.  On allocation or fill failure the error is
 * reported to group listeners via rtnl_set_sk_err() instead.
 */
void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
		     unsigned int nlm_flags)
{
	struct sk_buff *skb;
	struct net *net = info->nl_net;
	u32 seq;
	int err;

	err = -ENOBUFS;
	/* Echo the requester's sequence number when one is available. */
	seq = info->nlh ? info->nlh->nlmsg_seq : 0;

	/* rt6_nlmsg_size() pre-computes the exact message size for @rt. */
	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
	if (!skb)
		goto errout;

	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
			    event, info->portid, seq, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}
5089
5090 static int ip6_route_dev_notify(struct notifier_block *this,
5091                                 unsigned long event, void *ptr)
5092 {
5093         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
5094         struct net *net = dev_net(dev);
5095
5096         if (!(dev->flags & IFF_LOOPBACK))
5097                 return NOTIFY_OK;
5098
5099         if (event == NETDEV_REGISTER) {
5100                 net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
5101                 net->ipv6.ip6_null_entry->dst.dev = dev;
5102                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
5103 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5104                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
5105                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
5106                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
5107                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
5108 #endif
5109          } else if (event == NETDEV_UNREGISTER &&
5110                     dev->reg_state != NETREG_UNREGISTERED) {
5111                 /* NETDEV_UNREGISTER could be fired for multiple times by
5112                  * netdev_wait_allrefs(). Make sure we only call this once.
5113                  */
5114                 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
5115 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5116                 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
5117                 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
5118 #endif
5119         }
5120
5121         return NOTIFY_OK;
5122 }
5123
5124 /*
5125  *      /proc
5126  */
5127
5128 #ifdef CONFIG_PROC_FS
5129 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
5130 {
5131         struct net *net = (struct net *)seq->private;
5132         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
5133                    net->ipv6.rt6_stats->fib_nodes,
5134                    net->ipv6.rt6_stats->fib_route_nodes,
5135                    atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
5136                    net->ipv6.rt6_stats->fib_rt_entries,
5137                    net->ipv6.rt6_stats->fib_rt_cache,
5138                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
5139                    net->ipv6.rt6_stats->fib_discarded_routes);
5140
5141         return 0;
5142 }
5143 #endif  /* CONFIG_PROC_FS */
5144
5145 #ifdef CONFIG_SYSCTL
5146
5147 static
5148 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
5149                               void __user *buffer, size_t *lenp, loff_t *ppos)
5150 {
5151         struct net *net;
5152         int delay;
5153         int ret;
5154         if (!write)
5155                 return -EINVAL;
5156
5157         net = (struct net *)ctl->extra1;
5158         delay = net->ipv6.sysctl.flush_delay;
5159         ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
5160         if (ret)
5161                 return ret;
5162
5163         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
5164         return 0;
5165 }
5166
/* Range bounds (extra1/extra2) for the "skip_notify_on_dev_down" sysctl. */
static int zero;
static int one = 1;
5169
/* Template for the per-netns net.ipv6.route.* sysctl table.  Each netns
 * gets a kmemdup'd copy whose .data pointers are rewritten by
 * ipv6_route_sysctl_init(); that function indexes this array by position,
 * so entry order here must stay in sync with it.
 */
static struct ctl_table ipv6_route_table_template[] = {
	{
		/* Write-only trigger: flushes the route cache via GC. */
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		/* Same variable as "gc_min_interval", but in milliseconds. */
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{
		.procname	=	"skip_notify_on_dev_down",
		.data		=	&init_net.ipv6.sysctl.skip_notify_on_dev_down,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
		.extra1		=	&zero,
		.extra2		=	&one,
	},
	{ }
};
5252
/* Build the per-netns copy of ipv6_route_table_template, retargeting each
 * entry's .data at the corresponding field of @net.  Returns the table
 * (caller registers and eventually frees it) or NULL on allocation failure.
 */
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		/* Indices below must match the entry order in
		 * ipv6_route_table_template.
		 */
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;	/* flush handler finds its netns here */
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		/* gc_min_interval_ms shares the variable of entry 3. */
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
5282 #endif
5283
/* Per-netns setup for the IPv6 routing core: clone the dst_ops template,
 * allocate the special route entries (fib6/ip6 null, and with multiple
 * tables also prohibit/blackhole) and seed the sysctl defaults.
 * Returns 0 or -ENOMEM; on failure everything already allocated is
 * unwound via the goto chain below (labels in reverse allocation order).
 */
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	/* Point the cloned entry's dst at this netns' ops and metrics. */
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	/* Defaults for the net.ipv6.route.* sysctls of this netns. */
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
	net->ipv6.sysctl.skip_notify_on_dev_down = 0;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
5359
5360 static void __net_exit ip6_route_net_exit(struct net *net)
5361 {
5362         kfree(net->ipv6.fib6_null_entry);
5363         kfree(net->ipv6.ip6_null_entry);
5364 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5365         kfree(net->ipv6.ip6_prohibit_entry);
5366         kfree(net->ipv6.ip6_blk_hole_entry);
5367 #endif
5368         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5369 }
5370
5371 static int __net_init ip6_route_net_init_late(struct net *net)
5372 {
5373 #ifdef CONFIG_PROC_FS
5374         proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
5375                         sizeof(struct ipv6_route_iter));
5376         proc_create_net_single("rt6_stats", 0444, net->proc_net,
5377                         rt6_stats_seq_show, NULL);
5378 #endif
5379         return 0;
5380 }
5381
5382 static void __net_exit ip6_route_net_exit_late(struct net *net)
5383 {
5384 #ifdef CONFIG_PROC_FS
5385         remove_proc_entry("ipv6_route", net->proc_net);
5386         remove_proc_entry("rt6_stats", net->proc_net);
5387 #endif
5388 }
5389
/* Core per-netns setup/teardown for the IPv6 routing subsystem. */
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
5394
5395 static int __net_init ipv6_inetpeer_init(struct net *net)
5396 {
5397         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5398
5399         if (!bp)
5400                 return -ENOMEM;
5401         inet_peer_base_init(bp);
5402         net->ipv6.peers = bp;
5403         return 0;
5404 }
5405
5406 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5407 {
5408         struct inet_peer_base *bp = net->ipv6.peers;
5409
5410         net->ipv6.peers = NULL;
5411         inetpeer_invalidate_tree(bp);
5412         kfree(bp);
5413 }
5414
/* Per-netns setup/teardown for the IPv6 inetpeer storage. */
static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};
5419
/* Per-netns /proc file creation/removal, run after the core route init. */
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};
5424
/* Netdev notifier keeping the special route entries tied to loopback.
 * Priority is ADDRCONF_NOTIFY_PRIORITY - 10 so it is ordered relative to
 * addrconf's own notifier (runs after it in the chain).
 */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
5429
5430 void __init ip6_route_init_special_entries(void)
5431 {
5432         /* Registering of the loopback is done before this portion of code,
5433          * the loopback reference in rt6_info will not be taken, do it
5434          * manually for init_net */
5435         init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
5436         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5437         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5438   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5439         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5440         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5441         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5442         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5443   #endif
5444 }
5445
/* Module init for the IPv6 routing core: create the rt6_info slab,
 * register the pernet subsystems, fib6/xfrm6/fib6-rules layers, the
 * RTM_{NEW,DEL,GET}ROUTE rtnetlink handlers, the netdev notifier and the
 * per-CPU uncached route lists.  On any failure, everything registered so
 * far is unwound through the label chain (reverse registration order).
 */
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	/* Blackhole dsts come from the same slab as regular rt6_info. */
	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	/* GETROUTE can run without holding the RTNL mutex. */
	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

	/* Error unwind: reverse order of the registrations above. */
out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
5537
/* Module teardown: undo ip6_route_init() in reverse registration order
 * (notifier, late pernet ops, fib6 rules, xfrm6, fib6 GC, pernet subsys,
 * dst accounting, slab cache).
 */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}