/* net/ipv6/route.c */
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <net/ip.h>
67 #include <linux/uaccess.h>
68
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72
73 static int ip6_rt_type_to_error(u8 fib6_type);
74
75 #define CREATE_TRACE_POINTS
76 #include <trace/events/fib6.h>
77 EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
78 #undef CREATE_TRACE_POINTS
79
/* Neighbour-reachability verdicts used when scoring routes for default
 * router selection (see rt6_check_neigh()/rt6_score_route()); negative
 * values are failure modes.
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,		/* route must not be used */
	RT6_NUD_FAIL_PROBE = -2,	/* neighbour in NUD_FAILED state */
	RT6_NUD_FAIL_DO_RR = -1,	/* fall back to round-robin selection */
	RT6_NUD_SUCCEED = 1		/* neighbour (probably) reachable */
};
86
87 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
88 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
89 static unsigned int      ip6_mtu(const struct dst_entry *dst);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void             ip6_dst_destroy(struct dst_entry *);
92 static void             ip6_dst_ifdown(struct dst_entry *,
93                                        struct net_device *dev, int how);
94 static int               ip6_dst_gc(struct dst_ops *ops);
95
96 static int              ip6_pkt_discard(struct sk_buff *skb);
97 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
98 static int              ip6_pkt_prohibit(struct sk_buff *skb);
99 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
100 static void             ip6_link_failure(struct sk_buff *skb);
101 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
102                                            struct sk_buff *skb, u32 mtu);
103 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
104                                         struct sk_buff *skb);
105 static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
106 static size_t rt6_nlmsg_size(struct fib6_info *rt);
107 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
108                          struct fib6_info *rt, struct dst_entry *dst,
109                          struct in6_addr *dest, struct in6_addr *src,
110                          int iif, int type, u32 portid, u32 seq,
111                          unsigned int flags);
112 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
113                                            struct in6_addr *daddr,
114                                            struct in6_addr *saddr);
115
116 #ifdef CONFIG_IPV6_ROUTE_INFO
117 static struct fib6_info *rt6_add_route_info(struct net *net,
118                                            const struct in6_addr *prefix, int prefixlen,
119                                            const struct in6_addr *gwaddr,
120                                            struct net_device *dev,
121                                            unsigned int pref);
122 static struct fib6_info *rt6_get_route_info(struct net *net,
123                                            const struct in6_addr *prefix, int prefixlen,
124                                            const struct in6_addr *gwaddr,
125                                            struct net_device *dev);
126 #endif
127
/* Per-CPU list of uncached rt6_info entries; @head is protected by @lock. */
struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};
132
133 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
134
135 void rt6_uncached_list_add(struct rt6_info *rt)
136 {
137         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
138
139         rt->rt6i_uncached_list = ul;
140
141         spin_lock_bh(&ul->lock);
142         list_add_tail(&rt->rt6i_uncached, &ul->head);
143         spin_unlock_bh(&ul->lock);
144 }
145
/* Unlink @rt from the uncached list it was added to (if any) and drop
 * the per-netns uncached-route counter under the list lock.
 */
void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}
158
/* @dev is going away: walk every CPU's uncached list and re-home routes
 * that still reference @dev (device pointer and/or inet6_dev) onto the
 * namespace's loopback device, so @dev can be unregistered.
 */
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	/* routes already on loopback never need re-homing */
	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				/* take the loopback ref before dropping @dev */
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
190
191 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
192                                              struct sk_buff *skb,
193                                              const void *daddr)
194 {
195         if (!ipv6_addr_any(p))
196                 return (const void *) p;
197         else if (skb)
198                 return &ipv6_hdr(skb)->daddr;
199         return daddr;
200 }
201
202 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
203                                    struct net_device *dev,
204                                    struct sk_buff *skb,
205                                    const void *daddr)
206 {
207         struct neighbour *n;
208
209         daddr = choose_neigh_daddr(gw, skb, daddr);
210         n = __ipv6_neigh_lookup(dev, daddr);
211         if (n)
212                 return n;
213
214         n = neigh_create(&nd_tbl, daddr, dev);
215         return IS_ERR(n) ? NULL : n;
216 }
217
218 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
219                                               struct sk_buff *skb,
220                                               const void *daddr)
221 {
222         const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
223
224         return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
225 }
226
/* dst_ops->confirm_neigh: mark the next-hop neighbour as confirmed.
 * Skipped for devices without neighbour discovery (NOARP/loopback) and
 * for multicast destinations.
 */
static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}
241
/* dst_ops template for regular IPv6 routes; copied into each netns'
 * ipv6.ip6_dst_ops.
 */
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	dst_cow_metrics_generic,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};
260
261 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
262 {
263         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
264
265         return mtu ? : dst->dev->mtu;
266 }
267
/* Blackhole dsts intentionally ignore PMTU updates. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}
272
/* Blackhole dsts intentionally ignore redirects. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}
277
/* dst_ops for blackhole dsts: like ip6_dst_ops_template but with no-op
 * PMTU/redirect handlers and no gc/ifdown hooks.
 */
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
};
289
/* Default metrics template; hop limit 0 means "use the sysctl default". */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};
293
/* Template for the per-netns fib6_null_entry: a permanent, unreachable
 * reject route with the worst possible metric.
 */
static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol  = RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};
302
/* Template for the per-netns null route: discards packets and reports
 * -ENETUNREACH.
 */
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
314
315 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
316
/* Template for the prohibit route (policy routing): drops packets and
 * reports -EACCES.
 */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
328
/* Template for the blackhole route (policy routing): silently discards
 * packets (error -EINVAL, generic dst_discard handlers).
 */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
340
341 #endif
342
/* Zero the rt6_info-specific part of a freshly allocated entry. */
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	/* clear everything after the embedded dst_entry, which dst_alloc()
	 * has already initialized
	 */
	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}
350
351 /* allocate dst with ip6_dst_ops */
352 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
353                                int flags)
354 {
355         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
356                                         1, DST_OBSOLETE_FORCE_CHK, flags);
357
358         if (rt) {
359                 rt6_info_init(rt);
360                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
361         }
362
363         return rt;
364 }
365 EXPORT_SYMBOL(ip6_dst_alloc);
366
/* dst_ops->destroy: release everything a rt6_info holds - metrics,
 * uncached-list linkage, the inet6_dev reference and the fib6_info it
 * was created from.
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	ip_dst_metrics_put(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	/* clear rt->from under RCU before dropping the fib6_info ref */
	rcu_read_lock();
	from = rcu_dereference(rt->from);
	rcu_assign_pointer(rt->from, NULL);
	fib6_info_release(from);
	rcu_read_unlock();
}
388
/* dst_ops->ifdown: @dev is going down; re-point the route's inet6_dev
 * reference at the namespace's loopback device.
 */
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}
405
406 static bool __rt6_check_expired(const struct rt6_info *rt)
407 {
408         if (rt->rt6i_flags & RTF_EXPIRES)
409                 return time_after(jiffies, rt->dst.expires);
410         else
411                 return false;
412 }
413
/* Check whether the cached route itself has expired, or - if it still
 * references its originating fib6_info - whether that entry is obsolete
 * or expired.  rt->from is RCU-protected, so the caller is expected to
 * be in an RCU read-side section.
 */
static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}
429
/* Pick one route out of a multipath group based on the flow hash.
 * Each sibling owns a hash range ending at its nh_upper_bound; the
 * first sibling whose bound covers the hash wins, unless its score is
 * negative (unusable), in which case we stay with @match.
 */
struct fib6_info *fib6_multipath_select(const struct net *net,
					struct fib6_info *match,
					struct flowi6 *fl6, int oif,
					const struct sk_buff *skb,
					int strict)
{
	struct fib6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}
462
463 /*
464  *      Route lookup. rcu_read_lock() should be held.
465  */
466
/* Walk the fib6_info chain starting at @rt and return the first live
 * entry matching the requested output interface (or, without an oif,
 * the source address).  Falls back to @rt itself, or to the null entry
 * when a strict interface match was requested but not found.
 */
static inline struct fib6_info *rt6_device_match(struct net *net,
						 struct fib6_info *rt,
						    const struct in6_addr *saddr,
						    int oif,
						    int flags)
{
	struct fib6_info *sprt;

	/* no constraints at all: any live route will do */
	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
		const struct net_device *dev = sprt->fib6_nh.nh_dev;

		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE)
		return net->ipv6.fib6_null_entry;

	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
}
500
501 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred router-reachability probe: holds the target address and a
 * reference on the device until rt6_probe_deferred() runs.
 */
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};
507
508 static void rt6_probe_deferred(struct work_struct *w)
509 {
510         struct in6_addr mcaddr;
511         struct __rt6_probe_work *work =
512                 container_of(w, struct __rt6_probe_work, work);
513
514         addrconf_addr_solict_mult(&work->target, &mcaddr);
515         ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
516         dev_put(work->dev);
517         kfree(work);
518 }
519
/* Schedule a deferred reachability probe of @rt's gateway, rate-limited
 * to one per rtr_probe_interval, both for neighbours that exist but are
 * not NUD_VALID and (via rt->last_probe) for gateways with no neighbour
 * entry yet.  The actual NS transmit happens in rt6_probe_deferred().
 */
static void rt6_probe(struct fib6_info *rt)
{
	struct __rt6_probe_work *work = NULL;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;
	struct inet6_dev *idev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
		return;

	nh_gw = &rt->fib6_nh.nh_gw;
	dev = rt->fib6_nh.nh_dev;
	rcu_read_lock_bh();
	/* NOTE(review): idev is dereferenced below without a NULL check -
	 * confirm callers guarantee IPv6 is still enabled on dev here.
	 */
	idev = __in6_dev_get(dev);
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else if (time_after(jiffies, rt->last_probe +
				       idev->cnf.rtr_probe_interval)) {
		/* rate-limit probes for gateways with no neighbour entry */
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		rt->last_probe = jiffies;
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
574 #else
/* Router Reachability Probing is a no-op without CONFIG_IPV6_ROUTER_PREF. */
static inline void rt6_probe(struct fib6_info *rt)
{
}
578 #endif
579
580 /*
581  * Default Router Selection (RFC 2461 6.3.6)
582  */
583 static inline int rt6_check_dev(struct fib6_info *rt, int oif)
584 {
585         const struct net_device *dev = rt->fib6_nh.nh_dev;
586
587         if (!oif || dev->ifindex == oif)
588                 return 2;
589         return 0;
590 }
591
/* Neighbour-reachability component of the route score.  Routes without
 * a gateway (or with RTF_NONEXTHOP) always succeed.  With router
 * preferences enabled, only NUD_FAILED neighbours fail; otherwise any
 * non-NUD_VALID state fails, and a missing neighbour entry requests
 * round-robin fallback.
 */
static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	if (rt->fib6_flags & RTF_NONEXTHOP ||
	    !(rt->fib6_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
					  &rt->fib6_nh.nh_gw);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
623
/* Score @rt for default router selection: device match in the low bits,
 * router preference (if configured) above them.  Returns a negative
 * rt6_nud_state value when the route must not be used.
 */
static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}
641
642 /* called with rc_read_lock held */
643 static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
644 {
645         const struct net_device *dev = fib6_info_nh_dev(f6i);
646         bool rc = false;
647
648         if (dev) {
649                 const struct inet6_dev *idev = __in6_dev_get(dev);
650
651                 rc = !!idev->cnf.ignore_routes_with_linkdown;
652         }
653
654         return rc;
655 }
656
/* Compare @rt against the best candidate so far (@match at score *mpri)
 * and return whichever wins.  Dead, link-down (unless ignored) and
 * expired routes are skipped; RT6_NUD_FAIL_DO_RR scores as 0 but flags
 * the caller to round-robin via *do_rr.
 */
static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
				   int *mpri, struct fib6_info *match,
				   bool *do_rr)
{
	int m;
	bool match_do_rr = false;

	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		goto out;

	if (fib6_ignore_linkdown(rt) &&
	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (fib6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}
695
/* Find the best route among entries with @metric, scanning in
 * round-robin order: from @rr_head to the end of the metric group, then
 * from @leaf back around to @rr_head.  If nothing matched, fall through
 * to the remaining (worse-metric) entries starting at @cont.
 */
static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				     struct fib6_info *leaf,
				     struct fib6_info *rr_head,
				     u32 metric, int oif, int strict,
				     bool *do_rr)
{
	struct fib6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	/* first half of the rotation: rr_head onwards */
	for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	/* second half: leaf up to (excluding) rr_head */
	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	/* no usable route at @metric: consider the worse-metric tail */
	for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
734
/* Select the route to use from fib6_node @fn, honoring the node's
 * round-robin pointer and advancing it when find_rr_leaf() requests a
 * rotation.  Returns the per-netns null entry when nothing is usable.
 * Caller holds rcu_read_lock.
 */
static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
				   int oif, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not points to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.fib6_null_entry;
}
784
785 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
786 {
787         return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
788 }
789
790 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Process a Route Information option received in a Router Advertisement
 * (RFC 4191) from @gwaddr on @dev: validate the option, then add,
 * refresh or delete the corresponding route depending on the advertised
 * lifetime.  Returns 0 on success or -EINVAL for a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	/* prefix_len 0 advertises a default route */
	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	/* zero lifetime withdraws an existing route */
	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		/* drop the ref taken by the get/add helpers above */
		fib6_info_release(rt);
	}
	return 0;
}
864 #endif
865
866 /*
867  *      Misc support functions
868  */
869
870 /* called with rcu_lock held */
871 static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
872 {
873         struct net_device *dev = rt->fib6_nh.nh_dev;
874
875         if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
876                 /* for copies of local routes, dst->dev needs to be the
877                  * device if it is a master device, the master device if
878                  * device is enslaved, and the loopback as the default
879                  */
880                 if (netif_is_l3_slave(dev) &&
881                     !rt6_need_strict(&rt->fib6_dst.addr))
882                         dev = l3mdev_master_dev_rcu(dev);
883                 else if (!netif_is_l3_master(dev))
884                         dev = dev_net(dev)->loopback_dev;
885                 /* last case is netif_is_l3_master(dev) is true in which
886                  * case we want dev returned to be dev
887                  */
888         }
889
890         return dev;
891 }
892
/* Map each fib6 route type (RTN_*) to the dst error code delivered to
 * users of a matching route; 0 means packets are forwarded/delivered
 * normally, reject-style types carry a negative errno.
 */
static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};
907
/* Look up the dst error code for @fib6_type via the fib6_prop table. */
static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}
912
913 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
914 {
915         unsigned short flags = 0;
916
917         if (rt->dst_nocount)
918                 flags |= DST_NOCOUNT;
919         if (rt->dst_nopolicy)
920                 flags |= DST_NOPOLICY;
921         if (rt->dst_host)
922                 flags |= DST_HOST;
923
924         return flags;
925 }
926
927 static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
928 {
929         rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);
930
931         switch (ort->fib6_type) {
932         case RTN_BLACKHOLE:
933                 rt->dst.output = dst_discard_out;
934                 rt->dst.input = dst_discard;
935                 break;
936         case RTN_PROHIBIT:
937                 rt->dst.output = ip6_pkt_prohibit_out;
938                 rt->dst.input = ip6_pkt_prohibit;
939                 break;
940         case RTN_THROW:
941         case RTN_UNREACHABLE:
942         default:
943                 rt->dst.output = ip6_pkt_discard_out;
944                 rt->dst.input = ip6_pkt_discard;
945                 break;
946         }
947 }
948
/* Initialize the dst portion of @rt from fib entry @ort: error code,
 * input/output handlers, lwtunnel state and lastuse stamp.
 */
static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
{
	/* reject routes get discard/prohibit handlers and a dst error */
	if (ort->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, ort);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	/* choose the input handler from the route type / dst address */
	if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	/* inherit lightweight-tunnel encap state, which may redirect
	 * the input/output handlers set above
	 */
	if (ort->fib6_nh.nh_lwtstate) {
		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}
974
/* Caller must already hold reference to @from */
/* Link cached/pcpu route @rt back to its fib entry @from and share
 * @from's metrics.  rcu_assign_pointer publishes the link for RCU
 * readers of rt->from.
 */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	rcu_assign_pointer(rt->from, from);
	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
}
982
/* Caller must already hold reference to @ort */
/* Populate a freshly allocated rt6_info from fib entry @ort: dst
 * handlers, destination/source prefixes, idev, gateway and flags.
 */
static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
{
	struct net_device *dev = fib6_info_nh_dev(ort);

	ip6_rt_init_dst(rt, ort);

	rt->rt6i_dst = ort->fib6_dst;
	/* takes an idev reference; released when the dst is destroyed */
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
	rt->rt6i_flags = ort->fib6_flags;
	rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->fib6_src;
#endif
}
999
/* Walk back up the fib trie from @fn looking for the next node that
 * carries route info (RTN_RTINFO), descending into a parent's subtree
 * (source-address tree) when one exists.  Returns NULL once the tree
 * root is reached.  Runs under RCU (uses rcu_dereference on parent).
 */
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		/* re-enter the parent's subtree unless we just came from it */
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}
1017
1018 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
1019 {
1020         struct rt6_info *rt = *prt;
1021
1022         if (dst_hold_safe(&rt->dst))
1023                 return true;
1024         if (net) {
1025                 rt = net->ipv6.ip6_null_entry;
1026                 dst_hold(&rt->dst);
1027         } else {
1028                 rt = NULL;
1029         }
1030         *prt = rt;
1031         return false;
1032 }
1033
/* called with rcu_lock held */
/* Build a rt6_info (dst) from fib entry @rt.  On any failure - the fib
 * entry cannot be held or the dst allocation fails - fall back to the
 * per-net null entry so the caller always gets a referenced dst.
 */
static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rt6_info *nrt;

	if (!fib6_info_hold_safe(rt))
		goto fallback;

	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (!nrt) {
		/* drop the reference taken above */
		fib6_info_release(rt);
		goto fallback;
	}

	ip6_rt_copy_init(nrt, rt);
	return nrt;

fallback:
	nrt = dev_net(dev)->ipv6.ip6_null_entry;
	dst_hold(&nrt->dst);
	return nrt;
}
1058
/* Policy-rule lookup backend: find the fib entry for @fl6 in @table
 * under RCU, backtracking up the trie on a miss, then return a
 * referenced rt6_info - a cached exception route if one matches,
 * otherwise a dst freshly created from the fib entry (or the null
 * entry when nothing matched).
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_info *f6i;
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	f6i = rcu_dereference(fn->leaf);
	if (!f6i) {
		f6i = net->ipv6.fib6_null_entry;
	} else {
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
				      fl6->flowi6_oif, flags);
		/* only spread across siblings when no oif was requested */
		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
			f6i = fib6_multipath_select(net, f6i, fl6,
						    fl6->flowi6_oif, skb,
						    flags);
	}
	if (f6i == net->ipv6.fib6_null_entry) {
		/* no usable route at this node - try less specific ones */
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);
	} else if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		/* takes its own reference / falls back internally */
		rt = ip6_create_rt_rcu(f6i);
	}

	rcu_read_unlock();

	return rt;
}
1110
/* Public lookup entry point: run policy rules with the plain
 * ip6_pol_route_lookup backend and return the resulting dst.
 */
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
1117
1118 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1119                             const struct in6_addr *saddr, int oif,
1120                             const struct sk_buff *skb, int strict)
1121 {
1122         struct flowi6 fl6 = {
1123                 .flowi6_oif = oif,
1124                 .daddr = *daddr,
1125         };
1126         struct dst_entry *dst;
1127         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1128
1129         if (saddr) {
1130                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1131                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1132         }
1133
1134         dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1135         if (dst->error == 0)
1136                 return (struct rt6_info *) dst;
1137
1138         dst_release(dst);
1139
1140         return NULL;
1141 }
1142 EXPORT_SYMBOL(rt6_lookup);
1143
/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes new route entry, the addition fails by any reason the
 * route is released.
 * Caller must hold dst before calling it.
 */

/* Insert @rt into its fib table under tb6_lock; returns fib6_add()'s
 * result (0 on success, negative errno otherwise).
 */
static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}
1163
/* Insert @rt with default netlink info (no notification attributes,
 * no extack); see __ip6_ins_rt() for locking rules.
 */
int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = {	.nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}
1170
/* Allocate a host-route (RTF_CACHE) clone of fib entry @ort for the
 * exact destination @daddr (and @saddr under subtrees).  Returns NULL
 * if @ort cannot be held or allocation fails.
 */
static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (!fib6_info_hold_safe(ort))
		return NULL;

	dev = ip6_rt_get_dev_rcu(ort);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt) {
		/* drop the reference taken above */
		fib6_info_release(ort);
		return NULL;
	}

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	/* narrow the clone to a /128 host route for @daddr */
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		/* cloning a non-host on-link route to the prefix's own
		 * address yields an anycast destination
		 */
		if (ort->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
1212
/* Allocate a per-cpu (RTF_PCPU) dst copy of fib entry @rt.  Returns
 * NULL if @rt cannot be held or the dst allocation fails.
 */
static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	if (!fib6_info_hold_safe(rt))
		return NULL;

	/* ip6_rt_get_dev_rcu() requires RCU protection */
	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt) {
		/* drop the reference taken above */
		fib6_info_release(rt);
		return NULL;
	}
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}
1234
/* It should be called with rcu_read_lock() acquired */
/* Return this cpu's cached dst for fib entry @rt, taking a reference
 * via ip6_hold_safe() (may substitute NULL if the dst is going away).
 * Returns NULL when no per-cpu route has been created yet.
 */
static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt);

	return pcpu_rt;
}
1248
/* Create and install this cpu's cached dst for fib entry @rt.  On
 * allocation failure the referenced null entry is returned instead.
 * The returned route carries an extra reference for the caller.
 */
static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	/* the slot must still be empty - only this cpu fills it */
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	return pcpu_rt;
}
1267
/* exception hash table implementation
 *
 * rt6_exception_lock serializes all modifications of the per-route
 * exception buckets (insert/remove/flush); lookups on the read side
 * run under RCU instead.
 */
static DEFINE_SPINLOCK(rt6_exception_lock);
1271
/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct fib6_info *from;
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	net->ipv6.rt6_stats->fib_rt_cache--;

	/* purge completely the exception to allow releasing the held resources:
	 * some [sk] cache may keep the dst around for unlimited time
	 */
	from = rcu_dereference_protected(rt6_ex->rt6i->from,
					 lockdep_is_held(&rt6_exception_lock));
	/* unlink from the fib entry and drop its reference */
	rcu_assign_pointer(rt6_ex->rt6i->from, NULL);
	fib6_info_release(from);
	dst_dev_put(&rt6_ex->rt6i->dst);

	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	/* RCU readers may still traverse the entry; free after grace period */
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
}
1302
1303 /* Remove oldest rt6_ex in bucket and free the memory
1304  * Caller must hold rt6_exception_lock
1305  */
1306 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1307 {
1308         struct rt6_exception *rt6_ex, *oldest = NULL;
1309
1310         if (!bucket)
1311                 return;
1312
1313         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1314                 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1315                         oldest = rt6_ex;
1316         }
1317         rt6_remove_exception(bucket, oldest);
1318 }
1319
/* Hash an exception key - the destination address, plus the source
 * address under CONFIG_IPV6_SUBTREES - into a bucket index.  A random
 * per-boot seed makes the hash unpredictable to remote peers.
 */
static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
1335
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 *
 * Returns the matching exception entry or NULL.  On return *bucket
 * has been advanced to the hashed bucket even when nothing matched.
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		/* when a source key is given it must match too */
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1368
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 *
 * RCU twin of __rt6_find_exception_spinlock(): same lookup, but
 * traverses the chain with hlist_for_each_entry_rcu().
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		/* when a source key is given it must match too */
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1403
/* Effective MTU of fib entry @rt: the learned PMTU if set, otherwise
 * the nexthop device's IPv6 MTU; capped at IP6_MAX_MTU and reduced by
 * any lightweight-tunnel encapsulation headroom.
 */
static unsigned int fib6_mtu(const struct fib6_info *rt)
{
	unsigned int mtu;

	if (rt->fib6_pmtu) {
		mtu = rt->fib6_pmtu;
	} else {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		/* NOTE(review): __in6_dev_get() may return NULL; this
		 * assumes the nexthop device always has inet6 state -
		 * confirm callers guarantee that.
		 */
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
}
1424
/* Insert cached route @nrt into the exception table of fib entry @ort.
 * Allocates the bucket array on first use, replaces any existing entry
 * for the same (daddr[, saddr]) key, enforces a per-bucket depth limit
 * and, on success, bumps the fib node's sernum so stale cached dsts
 * are invalidated.  Returns 0 or a negative errno.
 */
static int rt6_insert_exception(struct rt6_info *nrt,
				struct fib6_info *ort)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	/* ort is being flushed/deleted - do not recreate its buckets */
	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		/* first exception on this route: allocate the bucket array */
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
		err = -EINVAL;
		goto out;
	}

	/* replace any existing exception for the same key */
	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	/* keep each bucket bounded by evicting the oldest entry */
	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->fib6_table->tb6_lock);
		fib6_update_sernum(net, ort);
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}
1504
/* Remove every cached exception of fib entry @rt and mark the entry
 * so that rt6_insert_exception() refuses to recreate the buckets
 * (used when @rt is being deleted).
 */
void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() to recreate the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		/* _safe variant: rt6_remove_exception unlinks entries */
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}
1531
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 *
 * Returns the cached route for (daddr[, saddr]) if present and not
 * expired, NULL otherwise.  No reference is taken on the result.
 */
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->fib6_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	/* expired entries are ignored (gc will reap them later) */
	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}
1563
/* Remove the passed in cached rt from the hash table that contains it */
/* Returns 0 on success, -EINVAL if @rt is not a cached route or has
 * no fib parent, -ENOENT if it is not found in the exception table.
 * NOTE(review): rt->from is read with rcu_dereference() before the
 * spinlock is taken - presumably callers run under rcu_read_lock();
 * verify at the call sites.
 */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	from = rcu_dereference(rt->from);
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}
1607
/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 *
 * The stamp feeds rt6_exception_remove_oldest()'s LRU-style eviction;
 * silently does nothing if @rt is not a cached route or not found.
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	if (!from || !(rt->rt6i_flags & RTF_CACHE))
		goto unlock;

	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

unlock:
	rcu_read_unlock();
}
1644
1645 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1646                                          struct rt6_info *rt, int mtu)
1647 {
1648         /* If the new MTU is lower than the route PMTU, this new MTU will be the
1649          * lowest MTU in the path: always allow updating the route PMTU to
1650          * reflect PMTU decreases.
1651          *
1652          * If the new MTU is higher, and the route PMTU is equal to the local
1653          * MTU, this means the old MTU is the lowest in the path, so allow
1654          * updating it: if other nodes now have lower MTUs, PMTU discovery will
1655          * handle this.
1656          */
1657
1658         if (dst_mtu(&rt->dst) >= mtu)
1659                 return true;
1660
1661         if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1662                 return true;
1663
1664         return false;
1665 }
1666
/* Propagate a device MTU change to every cached exception of fib
 * entry @rt whose own PMTU metric is set and for which the change is
 * permitted by rt6_mtu_change_route_allowed().
 * Caller must hold rt6_exception_lock (rcu_dereference_protected).
 */
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}
1695
/* mask selecting cached routes that go through a gateway */
#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

/* Drop every cached exception of fib entry @rt that is a gateway
 * route via @gateway (used when that gateway is no longer valid).
 */
static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* cheap lockless check before taking the write lock */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			/* _safe variant: entries are unlinked while walking */
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}
1732
/* Examine one exception entry during garbage collection and remove it
 * when it is aged out, expired, or routes via a gateway whose neighbour
 * is no longer a router.  Surviving entries bump gc_args->more so the
 * gc timer keeps running.
 */
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others have still references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		/* gateway stopped advertising itself as a router */
		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}
1776
/* Walk all exception buckets of @rt and age out stale entries via
 * rt6_age_examine_exception().  Part of the fib6 gc cycle.
 */
void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* Lockless fast path: most routes carry no exceptions. */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	/* rcu_read_lock_bh() covers __ipv6_neigh_lookup_noref() in the
	 * examine helper; the spinlock serializes exception removal.
	 */
	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}
1807
/* Resolve fl6->daddr/saddr in @table and return the best fib6_info,
 * or net->ipv6.fib6_null_entry when nothing matches.
 * must be called with rcu lock held
 */
struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
				    int oif, struct flowi6 *fl6, int strict)
{
	struct fib6_node *fn, *saved_fn;
	struct fib6_info *f6i;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	f6i = rt6_select(net, fn, oif, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
		/* No usable route at this node: walk back up the trie;
		 * once exhausted, retry from the original node with the
		 * reachability requirement relaxed.
		 */
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	return f6i;
}
1839
/* Core policy-routing lookup: resolve @fl6 in @table and return a
 * dst-holding rt6_info.  Depending on the match this is a cached
 * exception route, a one-off uncached RTF_CACHE clone (KNOWN_NH case),
 * or a per-cpu copy of the fib entry.  Never returns NULL; unresolved
 * lookups yield the (held) ip6_null_entry.
 */
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_info *f6i;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	/* When forwarding is globally off, require reachable routers. */
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	f6i = fib6_table_lookup(net, table, oif, fl6, strict);
	if (f6i->fib6_nsiblings)
		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);

	if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		return rt;
	}

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(f6i->fib6_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		return uncached_rt;
	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		/* BHs disabled: the per-cpu slot must not be raced by
		 * softirq context on this cpu.
		 */
		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(f6i);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, f6i);

		local_bh_enable();
		rcu_read_unlock();

		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
1917
1918 static struct rt6_info *ip6_pol_route_input(struct net *net,
1919                                             struct fib6_table *table,
1920                                             struct flowi6 *fl6,
1921                                             const struct sk_buff *skb,
1922                                             int flags)
1923 {
1924         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1925 }
1926
/* Input-side route lookup entry point: force a strict interface match
 * when the destination requires it (except on PIM register devices),
 * then defer to the policy-routing rule engine.
 */
struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6,
					 const struct sk_buff *skb,
					 int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1939
/* Fill @keys for multipath hashing from the L3 header of @skb.  For
 * the listed ICMPv6 error types the embedded (inner) offending header
 * is hashed instead, so the error takes the same path as the flow it
 * refers to; pre-dissected @flkeys are ignored in that case.
 */
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	/* Hash on the inner header; drop flkeys as they describe the
	 * outer packet.
	 */
	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowlabel(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}
1987
/* Compute the multipath hash for a flow according to the namespace's
 * hash policy: 0 = L3 fields only, 1 = L3 + L4 ports.
 * if skb is set it will be used and fl6 can be NULL
 */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		/* L3 hash: addresses, flow label and next header. */
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		/* L4 hash: addresses, ports and protocol. */
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	/* Shifted right by one to match the skb_get_hash_raw() shortcut
	 * above, so both paths yield comparable values.
	 */
	return mhash >> 1;
}
2044
/* Route an incoming skb: build a flow key from its IPv6 header (plus
 * tunnel id and, for ICMPv6, a multipath hash) and attach the looked-up
 * dst to the skb.
 */
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	/* Key on the tunnel id for decapsulated (RX-side) metadata. */
	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	/* ICMPv6 errors are hashed on the embedded offending packet. */
	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}
2074
2075 static struct rt6_info *ip6_pol_route_output(struct net *net,
2076                                              struct fib6_table *table,
2077                                              struct flowi6 *fl6,
2078                                              const struct sk_buff *skb,
2079                                              int flags)
2080 {
2081         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2082 }
2083
/* Output-side route lookup: handle l3mdev link-scope destinations,
 * derive strictness/saddr flags from the socket and flow, then defer
 * to the policy-routing rule engine.
 */
struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	/* Link-local/multicast destinations may resolve via an l3mdev. */
	if (ipv6_addr_type(&fl6->daddr) &
	    (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2113
/* Clone @dst_orig into a blackhole route on the loopback device: the
 * copy keeps the original's metrics, gateway and keys but discards all
 * traffic via dst_discard/dst_discard_out.  Consumes a reference on
 * @dst_orig and returns the new dst or ERR_PTR(-ENOMEM).
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new = &rt->dst;
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		/* RTF_PCPU must not leak into the copy. */
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}
2146
2147 /*
2148  *      Destination cache support functions
2149  */
2150
2151 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2152 {
2153         u32 rt_cookie = 0;
2154
2155         if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2156                 return false;
2157
2158         if (fib6_check_expired(f6i))
2159                 return false;
2160
2161         return true;
2162 }
2163
2164 static struct dst_entry *rt6_check(struct rt6_info *rt,
2165                                    struct fib6_info *from,
2166                                    u32 cookie)
2167 {
2168         u32 rt_cookie = 0;
2169
2170         if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2171             rt_cookie != cookie)
2172                 return NULL;
2173
2174         if (rt6_check_expired(rt))
2175                 return NULL;
2176
2177         return &rt->dst;
2178 }
2179
2180 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2181                                             struct fib6_info *from,
2182                                             u32 cookie)
2183 {
2184         if (!__rt6_check_expired(rt) &&
2185             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2186             fib6_check(from, cookie))
2187                 return &rt->dst;
2188         else
2189                 return NULL;
2190 }
2191
/* dst_ops->check callback: revalidate a cached dst against @cookie.
 * pcpu and uncached clones are checked through their parent fib entry;
 * everything else goes through rt6_check().  Returns the dst if still
 * valid, NULL to force the caller to relookup.
 */
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct dst_entry *dst_ret;
	struct fib6_info *from;
	struct rt6_info *rt;

	rt = container_of(dst, struct rt6_info, dst);

	rcu_read_lock();

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	from = rcu_dereference(rt->from);

	if (from && (rt->rt6i_flags & RTF_PCPU ||
	    unlikely(!list_empty(&rt->rt6i_uncached))))
		dst_ret = rt6_dst_from_check(rt, from, cookie);
	else
		dst_ret = rt6_check(rt, from, cookie);

	rcu_read_unlock();

	return dst_ret;
}
2219
/* dst_ops->negative_advice callback: on negative feedback, drop a
 * cached exception if it has expired, or release a non-cache dst
 * entirely.  Returns the (possibly NULL) dst the caller should keep.
 */
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			rcu_read_lock();
			if (rt6_check_expired(rt)) {
				rt6_remove_exception_rt(rt);
				dst = NULL;
			}
			rcu_read_unlock();
		} else {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}
2239
/* dst_ops->link_failure callback: report unreachability to the sender
 * and invalidate the route that failed — cached exceptions are removed
 * outright, default routes have their node's sernum poisoned.
 */
static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		rcu_read_lock();
		if (rt->rt6i_flags & RTF_CACHE) {
			rt6_remove_exception_rt(rt);
		} else {
			struct fib6_info *from;
			struct fib6_node *fn;

			from = rcu_dereference(rt->from);
			if (from) {
				/* -1 sernum: presumably makes cached dsts
				 * fail their cookie check — NOTE(review)
				 */
				fn = rcu_dereference(from->fib6_node);
				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
					fn->fn_sernum = -1;
			}
		}
		rcu_read_unlock();
	}
}
2265
/* Mark @rt0 as expiring: if it had no expiry yet, seed dst.expires
 * from the parent fib entry, then apply @timeout via dst_set_expires()
 * and set RTF_EXPIRES.
 */
static void rt6_update_expires(struct rt6_info *rt0, int timeout)
{
	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
		struct fib6_info *from;

		rcu_read_lock();
		from = rcu_dereference(rt0->from);
		if (from)
			rt0->dst.expires = from->expires;
		rcu_read_unlock();
	}

	dst_set_expires(&rt0->dst, timeout);
	rt0->rt6i_flags |= RTF_EXPIRES;
}
2281
2282 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2283 {
2284         struct net *net = dev_net(rt->dst.dev);
2285
2286         dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2287         rt->rt6i_flags |= RTF_MODIFIED;
2288         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2289 }
2290
2291 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2292 {
2293         return !(rt->rt6i_flags & RTF_CACHE) &&
2294                 (rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from));
2295 }
2296
2297 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2298                                  const struct ipv6hdr *iph, u32 mtu)
2299 {
2300         const struct in6_addr *daddr, *saddr;
2301         struct rt6_info *rt6 = (struct rt6_info *)dst;
2302
2303         if (dst_metric_locked(dst, RTAX_MTU))
2304                 return;
2305
2306         if (iph) {
2307                 daddr = &iph->daddr;
2308                 saddr = &iph->saddr;
2309         } else if (sk) {
2310                 daddr = &sk->sk_v6_daddr;
2311                 saddr = &inet6_sk(sk)->saddr;
2312         } else {
2313                 daddr = NULL;
2314                 saddr = NULL;
2315         }
2316         dst_confirm_neigh(dst, daddr);
2317         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2318         if (mtu >= dst_mtu(dst))
2319                 return;
2320
2321         if (!rt6_cache_allowed_for_pmtu(rt6)) {
2322                 rt6_do_update_pmtu(rt6, mtu);
2323                 /* update rt6_ex->stamp for cache */
2324                 if (rt6->rt6i_flags & RTF_CACHE)
2325                         rt6_update_exception_stamp_rt(rt6);
2326         } else if (daddr) {
2327                 struct fib6_info *from;
2328                 struct rt6_info *nrt6;
2329
2330                 rcu_read_lock();
2331                 from = rcu_dereference(rt6->from);
2332                 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
2333                 if (nrt6) {
2334                         rt6_do_update_pmtu(nrt6, mtu);
2335                         if (rt6_insert_exception(nrt6, from))
2336                                 dst_release_immediate(&nrt6->dst);
2337                 }
2338                 rcu_read_unlock();
2339         }
2340 }
2341
2342 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2343                                struct sk_buff *skb, u32 mtu)
2344 {
2345         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2346 }
2347
/* Update the path MTU for the flow described by the offending packet
 * in @skb->data: do an output route lookup and apply @mtu to the
 * resulting dst.
 */
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
		     int oif, u32 mark, kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_uid = uid,
	};

	dst = ip6_route_output(net, NULL, &fl6);
	if (!dst->error)
		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2368
/* Socket variant of ip6_update_pmtu(): derive oif/mark/uid from @sk,
 * then re-validate the socket's cached dst and refresh it for
 * connected datagram sockets whose dst just became stale.
 */
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	int oif = sk->sk_bound_dev_if;
	struct dst_entry *dst;

	if (!oif && skb->dev)
		oif = l3mdev_master_ifindex(skb->dev);

	ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);

	/* Nothing more to do if the socket has no dst or it is still
	 * valid for the stored cookie.
	 */
	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2390
/* Store @dst on @sk, passing the destination (and, with subtrees, the
 * source) address to ip6_dst_store() only when the flow's addresses
 * match the socket's own.
 */
void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
			   const struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_SUBTREES
	struct ipv6_pinfo *np = inet6_sk(sk);
#endif

	ip6_dst_store(sk, dst,
		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
		      &sk->sk_v6_daddr : NULL,
#ifdef CONFIG_IPV6_SUBTREES
		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
		      &np->saddr :
#endif
		      NULL);
}
2407
/* Handle redirects */
struct ip6rd_flowi {
	struct flowi6 fl6;		/* must stay first: lookups cast the
					 * flowi6 back to ip6rd_flowi
					 */
	struct in6_addr gateway;	/* router that sent the redirect */
};
2413
/* Lookup helper for received redirects: find the route currently used
 * for the destination and accept the redirect only if it came from
 * that route's next hop (directly or via a cached exception).
 */
static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	/* The advertised gateway rides along inside the flowi6 wrapper. */
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *ret = NULL, *rt_cache;
	struct fib6_info *rt;
	struct fib6_node *fn;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;
		if (fib6_check_expired(rt))
			continue;
		if (rt->fib6_flags & RTF_REJECT)
			break;
		if (!(rt->fib6_flags & RTF_GATEWAY))
			continue;
		if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
			continue;
		/* rt_cache's gateway might be different from its 'parent'
		 * in the case of an ip redirect.
		 * So we keep searching in the exception table if the gateway
		 * is different.
		 */
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
			rt_cache = rt6_find_cached_rt(rt,
						      &fl6->daddr,
						      &fl6->saddr);
			if (rt_cache &&
			    ipv6_addr_equal(&rdfl->gateway,
					    &rt_cache->rt6i_gateway)) {
				ret = rt_cache;
				break;
			}
			continue;
		}
		break;
	}

	if (!rt)
		rt = net->ipv6.fib6_null_entry;
	else if (rt->fib6_flags & RTF_REJECT) {
		ret = net->ipv6.ip6_null_entry;
		goto out;
	}

	if (rt == net->ipv6.fib6_null_entry) {
		/* No candidate at this node: walk back up the trie. */
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

out:
	if (ret)
		ip6_hold_safe(net, &ret);
	else
		ret = ip6_create_rt_rcu(rt);

	rcu_read_unlock();

	trace_fib6_table_lookup(net, rt, table, fl6);
	return ret;
};
2493
2494 static struct dst_entry *ip6_route_redirect(struct net *net,
2495                                             const struct flowi6 *fl6,
2496                                             const struct sk_buff *skb,
2497                                             const struct in6_addr *gateway)
2498 {
2499         int flags = RT6_LOOKUP_F_HAS_SADDR;
2500         struct ip6rd_flowi rdfl;
2501
2502         rdfl.fl6 = *fl6;
2503         rdfl.gateway = *gateway;
2504
2505         return fib6_rule_lookup(net, &rdfl.fl6, skb,
2506                                 flags, __ip6_route_redirect);
2507 }
2508
/* Process an ICMPv6 redirect carried in @skb: build a flow from the
 * embedded offending header, find the affected route and let
 * rt6_do_redirect() apply the new next hop.
 */
void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
		  kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_iif = LOOPBACK_IFINDEX,
		.flowi6_oif = oif,
		.flowi6_mark = mark,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_uid = uid,
	};

	dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_redirect);
2529
/* Redirect variant keyed on the ND redirect message itself rather than
 * an embedded offending header: the destination comes from the rd_msg.
 */
void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_iif = LOOPBACK_IFINDEX,
		.flowi6_oif = oif,
		.daddr = msg->dest,
		/* NOTE(review): saddr is the redirect's *destination*
		 * address (our own) — looks intentional for this lookup,
		 * but worth confirming against rt6_do_redirect().
		 */
		.saddr = iph->daddr,
		.flowi6_uid = sock_net_uid(net, NULL),
	};

	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
2547
2548 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2549 {
2550         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2551                      sk->sk_uid);
2552 }
2553 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2554
2555 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2556 {
2557         struct net_device *dev = dst->dev;
2558         unsigned int mtu = dst_mtu(dst);
2559         struct net *net = dev_net(dev);
2560
2561         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2562
2563         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2564                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2565
2566         /*
2567          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2568          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2569          * IPV6_MAXPLEN is also valid and means: "any MSS,
2570          * rely only on pmtu discovery"
2571          */
2572         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2573                 mtu = IPV6_MAXPLEN;
2574         return mtu;
2575 }
2576
2577 static unsigned int ip6_mtu(const struct dst_entry *dst)
2578 {
2579         struct inet6_dev *idev;
2580         unsigned int mtu;
2581
2582         mtu = dst_metric_raw(dst, RTAX_MTU);
2583         if (mtu)
2584                 goto out;
2585
2586         mtu = IPV6_MIN_MTU;
2587
2588         rcu_read_lock();
2589         idev = __in6_dev_get(dst->dev);
2590         if (idev)
2591                 mtu = idev->cnf.mtu6;
2592         rcu_read_unlock();
2593
2594 out:
2595         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2596
2597         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2598 }
2599
/* MTU selection:
 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 *
 * based on ip6_dst_mtu_forward and exception logic of
 * rt6_find_cached_rt; called with rcu_read_lock
 */
u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
		      struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct in6_addr *src_key;
	struct inet6_dev *idev;
	u32 mtu = 0;

	/* 1. A locked MTU metric on the route wins outright. */
	if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
		mtu = f6i->fib6_pmtu;
		if (mtu)
			goto out;
	}

	/* 2. Look for a PMTU recorded on a cached exception route.  The
	 * source address only participates in the exception key when
	 * source routing (subtrees) is in effect for this route.
	 */
	src_key = NULL;
#ifdef CONFIG_IPV6_SUBTREES
	if (f6i->fib6_src.plen)
		src_key = saddr;
#endif

	bucket = rcu_dereference(f6i->rt6i_exception_bucket);
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);

	/* 3. Fall back to the egress device MTU, at least IPV6_MIN_MTU. */
	if (likely(!mtu)) {
		struct net_device *dev = fib6_info_nh_dev(f6i);

		mtu = IPV6_MIN_MTU;
		idev = __in6_dev_get(dev);
		if (idev && idev->cnf.mtu6 > mtu)
			mtu = idev->cnf.mtu6;
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
out:
	/* Reserve headroom needed by a lightweight tunnel, if any. */
	return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
}
2647
2648 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2649                                   struct flowi6 *fl6)
2650 {
2651         struct dst_entry *dst;
2652         struct rt6_info *rt;
2653         struct inet6_dev *idev = in6_dev_get(dev);
2654         struct net *net = dev_net(dev);
2655
2656         if (unlikely(!idev))
2657                 return ERR_PTR(-ENODEV);
2658
2659         rt = ip6_dst_alloc(net, dev, 0);
2660         if (unlikely(!rt)) {
2661                 in6_dev_put(idev);
2662                 dst = ERR_PTR(-ENOMEM);
2663                 goto out;
2664         }
2665
2666         rt->dst.flags |= DST_HOST;
2667         rt->dst.input = ip6_input;
2668         rt->dst.output  = ip6_output;
2669         rt->rt6i_gateway  = fl6->daddr;
2670         rt->rt6i_dst.addr = fl6->daddr;
2671         rt->rt6i_dst.plen = 128;
2672         rt->rt6i_idev     = idev;
2673         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2674
2675         /* Add this dst into uncached_list so that rt6_disable_ip() can
2676          * do proper release of the net_device
2677          */
2678         rt6_uncached_list_add(rt);
2679         atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2680
2681         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2682
2683 out:
2684         return dst;
2685 }
2686
2687 static int ip6_dst_gc(struct dst_ops *ops)
2688 {
2689         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2690         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2691         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2692         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2693         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2694         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2695         int entries;
2696
2697         entries = dst_entries_get_fast(ops);
2698         if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2699             entries <= rt_max_size)
2700                 goto out;
2701
2702         net->ipv6.ip6_rt_gc_expire++;
2703         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2704         entries = dst_entries_get_slow(ops);
2705         if (entries < ops->gc_thresh)
2706                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2707 out:
2708         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2709         return entries > rt_max_size;
2710 }
2711
2712 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2713                                             struct fib6_config *cfg,
2714                                             const struct in6_addr *gw_addr,
2715                                             u32 tbid, int flags)
2716 {
2717         struct flowi6 fl6 = {
2718                 .flowi6_oif = cfg->fc_ifindex,
2719                 .daddr = *gw_addr,
2720                 .saddr = cfg->fc_prefsrc,
2721         };
2722         struct fib6_table *table;
2723         struct rt6_info *rt;
2724
2725         table = fib6_get_table(net, tbid);
2726         if (!table)
2727                 return NULL;
2728
2729         if (!ipv6_addr_any(&cfg->fc_prefsrc))
2730                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2731
2732         flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2733         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2734
2735         /* if table lookup failed, fall back to full lookup */
2736         if (rt == net->ipv6.ip6_null_entry) {
2737                 ip6_rt_put(rt);
2738                 rt = NULL;
2739         }
2740
2741         return rt;
2742 }
2743
2744 static int ip6_route_check_nh_onlink(struct net *net,
2745                                      struct fib6_config *cfg,
2746                                      const struct net_device *dev,
2747                                      struct netlink_ext_ack *extack)
2748 {
2749         u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2750         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2751         u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2752         struct fib6_info *from;
2753         struct rt6_info *grt;
2754         int err;
2755
2756         err = 0;
2757         grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2758         if (grt) {
2759                 rcu_read_lock();
2760                 from = rcu_dereference(grt->from);
2761                 if (!grt->dst.error &&
2762                     /* ignore match if it is the default route */
2763                     from && !ipv6_addr_any(&from->fib6_dst.addr) &&
2764                     (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2765                         NL_SET_ERR_MSG(extack,
2766                                        "Nexthop has invalid gateway or device mismatch");
2767                         err = -EINVAL;
2768                 }
2769                 rcu_read_unlock();
2770
2771                 ip6_rt_put(grt);
2772         }
2773
2774         return err;
2775 }
2776
/* Resolve and validate a gateway nexthop by routing to the gateway address.
 * On success returns 0; when *_dev was NULL on entry, it and *idev are
 * filled in from the matched route with references held.  Returns
 * -EHOSTUNREACH when the gateway cannot be resolved, resolves to a
 * different device than requested, or itself requires a gateway.
 */
static int ip6_route_check_nh(struct net *net,
			      struct fib6_config *cfg,
			      struct net_device **_dev,
			      struct inet6_dev **idev)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	struct net_device *dev = _dev ? *_dev : NULL;
	struct rt6_info *grt = NULL;
	int err = -EHOSTUNREACH;

	/* Prefer a lookup scoped to the table the route is being added to. */
	if (cfg->fc_table) {
		int flags = RT6_LOOKUP_F_IFACE;

		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
					  cfg->fc_table, flags);
		if (grt) {
			/* Discard matches that themselves go via a gateway
			 * or via a different device than the caller gave.
			 */
			if (grt->rt6i_flags & RTF_GATEWAY ||
			    (dev && dev != grt->dst.dev)) {
				ip6_rt_put(grt);
				grt = NULL;
			}
		}
	}

	/* Fall back to a lookup across all tables. */
	if (!grt)
		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);

	if (!grt)
		goto out;

	if (dev) {
		/* Caller supplied a device: it must match the route's. */
		if (dev != grt->dst.dev) {
			ip6_rt_put(grt);
			goto out;
		}
	} else {
		/* Adopt device/idev from the matched route; take references
		 * for the caller before dropping grt below.
		 */
		*_dev = dev = grt->dst.dev;
		*idev = grt->rt6i_idev;
		dev_hold(dev);
		in6_dev_hold(grt->rt6i_idev);
	}

	/* Only a directly-connected (non-gateway) match validates the hop. */
	if (!(grt->rt6i_flags & RTF_GATEWAY))
		err = 0;

	ip6_rt_put(grt);

out:
	return err;
}
2827
/* Validate the gateway of a new route and resolve its egress device.
 * *_dev/*idev may be updated by ip6_route_check_nh() when no device was
 * given.  Returns 0 on success or a negative errno with an extack message.
 */
static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
			   struct net_device **_dev, struct inet6_dev **idev,
			   struct netlink_ext_ack *extack)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	int gwa_type = ipv6_addr_type(gw_addr);
	/* For link-local gateways a local address on another device is
	 * acceptable, so the device itself is skipped in the check.
	 */
	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
	const struct net_device *dev = *_dev;
	bool need_addr_check = !dev;
	int err = -EINVAL;

	/* if gw_addr is local we will fail to detect this in case
	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
	 * will return already-added prefix route via interface that
	 * prefix route was assigned to, which might be non-loopback.
	 */
	if (dev &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
		/* IPv6 strictly inhibits using not link-local
		 * addresses as nexthop address.
		 * Otherwise, router will not able to send redirects.
		 * It is very good, but in some (rare!) circumstances
		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
		 * some exceptions. --ANK
		 * We allow IPv4-mapped nexthops to support RFC4798-type
		 * addressing
		 */
		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
			NL_SET_ERR_MSG(extack, "Invalid gateway address");
			goto out;
		}

		/* Onlink routes only need the gateway checked against the
		 * given device's table; otherwise resolve it via a lookup.
		 */
		if (cfg->fc_flags & RTNH_F_ONLINK)
			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
		else
			err = ip6_route_check_nh(net, cfg, _dev, idev);

		if (err)
			goto out;
	}

	/* reload in case device was changed */
	dev = *_dev;

	err = -EINVAL;
	if (!dev) {
		NL_SET_ERR_MSG(extack, "Egress device not specified");
		goto out;
	} else if (dev->flags & IFF_LOOPBACK) {
		NL_SET_ERR_MSG(extack,
			       "Egress device can not be loopback device for this route");
		goto out;
	}

	/* if we did not check gw_addr above, do so now that the
	 * egress device has been resolved.
	 */
	if (need_addr_check &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	err = 0;
out:
	return err;
}
2900
2901 static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type)
2902 {
2903         if ((flags & RTF_REJECT) ||
2904             (dev && (dev->flags & IFF_LOOPBACK) &&
2905              !(addr_type & IPV6_ADDR_LOOPBACK) &&
2906              !(flags & RTF_LOCAL)))
2907                 return true;
2908
2909         return false;
2910 }
2911
/* Initialize a fib6_nh from a route config: resolve the nexthop device,
 * build any lwtunnel state, validate the gateway and set the nexthop
 * flags.  On success fib6_nh->nh_dev holds a device reference; on failure
 * all acquired references and lwtstate are dropped before returning a
 * negative errno (with an extack message where applicable).
 */
int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
		 struct fib6_config *cfg, gfp_t gfp_flags,
		 struct netlink_ext_ack *extack)
{
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	int addr_type;
	int err;

	/* Take device + inet6_dev references when an ifindex was given. */
	err = -ENODEV;
	if (cfg->fc_ifindex) {
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_flags & RTNH_F_ONLINK) {
		if (!dev) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop device required for onlink");
			goto out;
		}

		if (!(dev->flags & IFF_UP)) {
			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
			err = -ENETDOWN;
			goto out;
		}

		fib6_nh->nh_flags |= RTNH_F_ONLINK;
	}

	if (cfg->fc_encap) {
		struct lwtunnel_state *lwtstate;

		err = lwtunnel_build_state(cfg->fc_encap_type,
					   cfg->fc_encap, AF_INET6, cfg,
					   &lwtstate, extack);
		if (err)
			goto out;

		fib6_nh->nh_lwtstate = lwtstate_get(lwtstate);
	}

	fib6_nh->nh_weight = 1;

	/* We cannot add true routes via loopback here,
	 * they would result in kernel looping; promote them to reject routes
	 */
	addr_type = ipv6_addr_type(&cfg->fc_dst);
	if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		goto set_dev;
	}

	/* Gateway validation may also resolve dev/idev when none was given. */
	if (cfg->fc_flags & RTF_GATEWAY) {
		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
		if (err)
			goto out;

		fib6_nh->nh_gw = cfg->fc_gateway;
	}

	err = -ENODEV;
	if (!dev)
		goto out;

	if (idev->cnf.disable_ipv6) {
		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
		err = -EACCES;
		goto out;
	}

	if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) {
		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
		err = -ENETDOWN;
		goto out;
	}

	if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
	    !netif_carrier_ok(dev))
		fib6_nh->nh_flags |= RTNH_F_LINKDOWN;

set_dev:
	/* The device reference is transferred to the fib6_nh. */
	fib6_nh->nh_dev = dev;
	err = 0;
out:
	/* The idev reference was only needed for validation. */
	if (idev)
		in6_dev_put(idev);

	/* On error, undo lwtstate and the device reference taken above. */
	if (err) {
		lwtstate_put(fib6_nh->nh_lwtstate);
		fib6_nh->nh_lwtstate = NULL;
		if (dev)
			dev_put(dev);
	}

	return err;
}
3027
3028 void fib6_nh_release(struct fib6_nh *fib6_nh)
3029 {
3030         lwtstate_put(fib6_nh->nh_lwtstate);
3031
3032         if (fib6_nh->nh_dev)
3033                 dev_put(fib6_nh->nh_dev);
3034 }
3035
/* Build a fib6_info from a netlink/ioctl route config without inserting it
 * into any table.  Validates the config, resolves the table and nexthop,
 * and fills in destination/source prefixes, metrics and flags.  Returns
 * the new fib6_info (caller owns the reference) or an ERR_PTR on failure.
 */
static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
					      gfp_t gfp_flags,
					      struct netlink_ext_ack *extack)
{
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct fib6_info *rt = NULL;
	struct fib6_table *table;
	int err = -EINVAL;
	int addr_type;

	/* RTF_PCPU is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_PCPU) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
		goto out;
	}

	/* RTF_CACHE is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_CACHE) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
		goto out;
	}

	if (cfg->fc_type > RTN_MAX) {
		NL_SET_ERR_MSG(extack, "Invalid route type");
		goto out;
	}

	/* IPv6 prefix lengths are at most 128 bits. */
	if (cfg->fc_dst_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		goto out;
	}
	if (cfg->fc_src_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid source address length");
		goto out;
	}
#ifndef CONFIG_IPV6_SUBTREES
	if (cfg->fc_src_len) {
		NL_SET_ERR_MSG(extack,
			       "Specifying source address requires IPV6_SUBTREES to be enabled");
		goto out;
	}
#endif

	/* Find the target table; create it unless the request explicitly
	 * lacked NLM_F_CREATE (in which case warn but still create).
	 */
	err = -ENOBUFS;
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
		if (!table) {
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}

	if (!table)
		goto out;

	err = -ENOMEM;
	rt = fib6_info_alloc(gfp_flags);
	if (!rt)
		goto out;

	rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
					       extack);
	if (IS_ERR(rt->fib6_metrics)) {
		err = PTR_ERR(rt->fib6_metrics);
		/* Do not leave garbage there. */
		rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
		goto out;
	}

	if (cfg->fc_flags & RTF_ADDRCONF)
		rt->dst_nocount = true;

	if (cfg->fc_flags & RTF_EXPIRES)
		fib6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
	else
		fib6_clean_expires(rt);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->fib6_protocol = cfg->fc_protocol;

	rt->fib6_table = table;
	rt->fib6_metric = cfg->fc_metric;
	rt->fib6_type = cfg->fc_type;
	rt->fib6_flags = cfg->fc_flags;

	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->fib6_dst.plen = cfg->fc_dst_len;
	if (rt->fib6_dst.plen == 128)
		rt->dst_host = true;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->fib6_src.plen = cfg->fc_src_len;
#endif
	err = fib6_nh_init(net, &rt->fib6_nh, cfg, gfp_flags, extack);
	if (err)
		goto out;

	/* We cannot add true routes via loopback here,
	 * they would result in kernel looping; promote them to reject routes
	 */
	addr_type = ipv6_addr_type(&cfg->fc_dst);
	if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh.nh_dev, addr_type))
		rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;

	/* A preferred source address must be assigned to the nexthop dev. */
	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		struct net_device *dev = fib6_info_nh_dev(rt);

		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid source address");
			err = -EINVAL;
			goto out;
		}
		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
		rt->fib6_prefsrc.plen = 128;
	} else
		rt->fib6_prefsrc.plen = 0;

	return rt;
out:
	/* fib6_info_release() tolerates rt == NULL on early failures. */
	fib6_info_release(rt);
	return ERR_PTR(err);
}
3164
3165 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3166                   struct netlink_ext_ack *extack)
3167 {
3168         struct fib6_info *rt;
3169         int err;
3170
3171         rt = ip6_route_info_create(cfg, gfp_flags, extack);
3172         if (IS_ERR(rt))
3173                 return PTR_ERR(rt);
3174
3175         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3176         fib6_info_release(rt);
3177
3178         return err;
3179 }
3180
3181 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3182 {
3183         struct net *net = info->nl_net;
3184         struct fib6_table *table;
3185         int err;
3186
3187         if (rt == net->ipv6.fib6_null_entry) {
3188                 err = -ENOENT;
3189                 goto out;
3190         }
3191
3192         table = rt->fib6_table;
3193         spin_lock_bh(&table->tb6_lock);
3194         err = fib6_del(rt, info);
3195         spin_unlock_bh(&table->tb6_lock);
3196
3197 out:
3198         fib6_info_release(rt);
3199         return err;
3200 }
3201
3202 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3203 {
3204         struct nl_info info = { .nl_net = net };
3205
3206         return __ip6_del_rt(rt, &info);
3207 }
3208
/* Delete a multipath route: rt plus, when fc_delete_all_nh is set, all of
 * its siblings, under a single table lock.  A single RTM_DELROUTE
 * notification covering all hops is sent when it can be built; otherwise
 * fib6_del() falls back to per-hop notifications.  The caller's reference
 * on rt is consumed.
 */
static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	int err = -ENOENT;

	if (rt == net->ipv6.fib6_null_entry)
		goto out_put;
	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);

	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
		struct fib6_info *sibling, *next_sibling;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			/* If the combined message cannot be built, drop it
			 * and let fib6_del() notify per hop instead.
			 */
			if (rt6_fill_node(net, skb, rt, NULL,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				kfree_skb(skb);
				skb = NULL;
			} else
				info->skip_notify = 1;
		}

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings,
					 fib6_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
		}
	}

	err = fib6_del(rt, info);
out_unlock:
	spin_unlock_bh(&table->tb6_lock);
out_put:
	fib6_info_release(rt);

	/* Send the combined notification outside the table lock. */
	if (skb) {
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
	return err;
}
3260
3261 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3262 {
3263         int rc = -ESRCH;
3264
3265         if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3266                 goto out;
3267
3268         if (cfg->fc_flags & RTF_GATEWAY &&
3269             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3270                 goto out;
3271
3272         rc = rt6_remove_exception_rt(rt);
3273 out:
3274         return rc;
3275 }
3276
/* Delete the route described by cfg.  With RTF_CACHE set, only cached
 * exception routes are considered; otherwise the first FIB entry matching
 * the configured device, gateway, metric and protocol is deleted.
 * Returns -ESRCH when nothing matches.
 */
static int ip6_route_del(struct fib6_config *cfg,
			 struct netlink_ext_ack *extack)
{
	struct rt6_info *rt_cache;
	struct fib6_table *table;
	struct fib6_info *rt;
	struct fib6_node *fn;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table) {
		NL_SET_ERR_MSG(extack, "FIB table does not exist");
		return err;
	}

	rcu_read_lock();

	/* For cached routes, locate the covering node rather than an exact
	 * match (the exact-match flag is the negation of RTF_CACHE here).
	 */
	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len,
			 !(cfg->fc_flags & RTF_CACHE));

	if (fn) {
		for_each_fib6_node_rt_rcu(fn) {
			if (cfg->fc_flags & RTF_CACHE) {
				int rc;

				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
							      &cfg->fc_src);
				if (rt_cache) {
					rc = ip6_del_cached_rt(rt_cache, cfg);
					/* -ESRCH means "no match"; keep
					 * scanning the remaining entries.
					 */
					if (rc != -ESRCH) {
						rcu_read_unlock();
						return rc;
					}
				}
				continue;
			}
			/* Skip entries that do not match the requested
			 * device, gateway, metric or protocol.
			 */
			if (cfg->fc_ifindex &&
			    (!rt->fib6_nh.nh_dev ||
			     rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
				continue;
			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
				continue;
			if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
				continue;
			/* Take a reference before leaving the RCU section;
			 * skip entries already being freed.
			 */
			if (!fib6_info_hold_safe(rt))
				continue;
			rcu_read_unlock();

			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				return __ip6_del_rt(rt, &cfg->fc_nlinfo);

			return __ip6_del_rt_siblings(rt, cfg);
		}
	}
	rcu_read_unlock();

	return err;
}
3341
3342 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3343 {
3344         struct netevent_redirect netevent;
3345         struct rt6_info *rt, *nrt = NULL;
3346         struct ndisc_options ndopts;
3347         struct inet6_dev *in6_dev;
3348         struct neighbour *neigh;
3349         struct fib6_info *from;
3350         struct rd_msg *msg;
3351         int optlen, on_link;
3352         u8 *lladdr;
3353
3354         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3355         optlen -= sizeof(*msg);
3356
3357         if (optlen < 0) {
3358                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3359                 return;
3360         }
3361
3362         msg = (struct rd_msg *)icmp6_hdr(skb);
3363
3364         if (ipv6_addr_is_multicast(&msg->dest)) {
3365                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3366                 return;
3367         }
3368
3369         on_link = 0;
3370         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3371                 on_link = 1;
3372         } else if (ipv6_addr_type(&msg->target) !=
3373                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3374                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3375                 return;
3376         }
3377
3378         in6_dev = __in6_dev_get(skb->dev);
3379         if (!in6_dev)
3380                 return;
3381         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3382                 return;
3383
3384         /* RFC2461 8.1:
3385          *      The IP source address of the Redirect MUST be the same as the current
3386          *      first-hop router for the specified ICMP Destination Address.
3387          */
3388
3389         if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3390                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3391                 return;
3392         }
3393
3394         lladdr = NULL;
3395         if (ndopts.nd_opts_tgt_lladdr) {
3396                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3397                                              skb->dev);
3398                 if (!lladdr) {
3399                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3400                         return;
3401                 }
3402         }
3403
3404         rt = (struct rt6_info *) dst;
3405         if (rt->rt6i_flags & RTF_REJECT) {
3406                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3407                 return;
3408         }
3409
3410         /* Redirect received -> path was valid.
3411          * Look, redirects are sent only in response to data packets,
3412          * so that this nexthop apparently is reachable. --ANK
3413          */
3414         dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3415
3416         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3417         if (!neigh)
3418                 return;
3419
3420         /*
3421          *      We have finally decided to accept it.
3422          */
3423
3424         ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3425                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
3426                      NEIGH_UPDATE_F_OVERRIDE|
3427                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3428                                      NEIGH_UPDATE_F_ISROUTER)),
3429                      NDISC_REDIRECT, &ndopts);
3430
3431         rcu_read_lock();
3432         from = rcu_dereference(rt->from);
3433         /* This fib6_info_hold() is safe here because we hold reference to rt
3434          * and rt already holds reference to fib6_info.
3435          */
3436         fib6_info_hold(from);
3437         rcu_read_unlock();
3438
3439         nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3440         if (!nrt)
3441                 goto out;
3442
3443         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3444         if (on_link)
3445                 nrt->rt6i_flags &= ~RTF_GATEWAY;
3446
3447         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3448
3449         /* No need to remove rt from the exception table if rt is
3450          * a cached route because rt6_insert_exception() will
3451          * takes care of it
3452          */
3453         if (rt6_insert_exception(nrt, from)) {
3454                 dst_release_immediate(&nrt->dst);
3455                 goto out;
3456         }
3457
3458         netevent.old = &rt->dst;
3459         netevent.new = &nrt->dst;
3460         netevent.daddr = &msg->dest;
3461         netevent.neigh = neigh;
3462         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3463
3464 out:
3465         fib6_info_release(from);
3466         neigh_release(neigh);
3467 }
3468
3469 #ifdef CONFIG_IPV6_ROUTE_INFO
3470 static struct fib6_info *rt6_get_route_info(struct net *net,
3471                                            const struct in6_addr *prefix, int prefixlen,
3472                                            const struct in6_addr *gwaddr,
3473                                            struct net_device *dev)
3474 {
3475         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3476         int ifindex = dev->ifindex;
3477         struct fib6_node *fn;
3478         struct fib6_info *rt = NULL;
3479         struct fib6_table *table;
3480
3481         table = fib6_get_table(net, tb_id);
3482         if (!table)
3483                 return NULL;
3484
3485         rcu_read_lock();
3486         fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3487         if (!fn)
3488                 goto out;
3489
3490         for_each_fib6_node_rt_rcu(fn) {
3491                 if (rt->fib6_nh.nh_dev->ifindex != ifindex)
3492                         continue;
3493                 if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3494                         continue;
3495                 if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
3496                         continue;
3497                 if (!fib6_info_hold_safe(rt))
3498                         continue;
3499                 break;
3500         }
3501 out:
3502         rcu_read_unlock();
3503         return rt;
3504 }
3505
3506 static struct fib6_info *rt6_add_route_info(struct net *net,
3507                                            const struct in6_addr *prefix, int prefixlen,
3508                                            const struct in6_addr *gwaddr,
3509                                            struct net_device *dev,
3510                                            unsigned int pref)
3511 {
3512         struct fib6_config cfg = {
3513                 .fc_metric      = IP6_RT_PRIO_USER,
3514                 .fc_ifindex     = dev->ifindex,
3515                 .fc_dst_len     = prefixlen,
3516                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3517                                   RTF_UP | RTF_PREF(pref),
3518                 .fc_protocol = RTPROT_RA,
3519                 .fc_type = RTN_UNICAST,
3520                 .fc_nlinfo.portid = 0,
3521                 .fc_nlinfo.nlh = NULL,
3522                 .fc_nlinfo.nl_net = net,
3523         };
3524
3525         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3526         cfg.fc_dst = *prefix;
3527         cfg.fc_gateway = *gwaddr;
3528
3529         /* We should treat it as a default route if prefix length is 0. */
3530         if (!prefixlen)
3531                 cfg.fc_flags |= RTF_DEFAULT;
3532
3533         ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3534
3535         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3536 }
3537 #endif
3538
3539 struct fib6_info *rt6_get_dflt_router(struct net *net,
3540                                      const struct in6_addr *addr,
3541                                      struct net_device *dev)
3542 {
3543         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3544         struct fib6_info *rt;
3545         struct fib6_table *table;
3546
3547         table = fib6_get_table(net, tb_id);
3548         if (!table)
3549                 return NULL;
3550
3551         rcu_read_lock();
3552         for_each_fib6_node_rt_rcu(&table->tb6_root) {
3553                 if (dev == rt->fib6_nh.nh_dev &&
3554                     ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3555                     ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
3556                         break;
3557         }
3558         if (rt && !fib6_info_hold_safe(rt))
3559                 rt = NULL;
3560         rcu_read_unlock();
3561         return rt;
3562 }
3563
3564 struct fib6_info *rt6_add_dflt_router(struct net *net,
3565                                      const struct in6_addr *gwaddr,
3566                                      struct net_device *dev,
3567                                      unsigned int pref)
3568 {
3569         struct fib6_config cfg = {
3570                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3571                 .fc_metric      = IP6_RT_PRIO_USER,
3572                 .fc_ifindex     = dev->ifindex,
3573                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3574                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3575                 .fc_protocol = RTPROT_RA,
3576                 .fc_type = RTN_UNICAST,
3577                 .fc_nlinfo.portid = 0,
3578                 .fc_nlinfo.nlh = NULL,
3579                 .fc_nlinfo.nl_net = net,
3580         };
3581
3582         cfg.fc_gateway = *gwaddr;
3583
3584         if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3585                 struct fib6_table *table;
3586
3587                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3588                 if (table)
3589                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3590         }
3591
3592         return rt6_get_dflt_router(net, gwaddr, dev);
3593 }
3594
/* Delete every RA-learned default router entry from @table.
 *
 * ip6_del_rt() cannot run under rcu_read_lock(), so each deletion drops
 * the lock and restarts the walk from the table root.
 */
static void __rt6_purge_dflt_routers(struct net *net,
				     struct fib6_table *table)
{
	struct fib6_info *rt;

restart:
	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;

		/* accept_ra == 2 keeps RA routes even on routers; skip
		 * those.  fib6_info_hold_safe() guards against racing
		 * with an entry that is already being freed.
		 */
		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!idev || idev->cnf.accept_ra != 2) &&
		    fib6_info_hold_safe(rt)) {
			rcu_read_unlock();
			ip6_del_rt(net, rt);
			goto restart;
		}
	}
	rcu_read_unlock();

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}
3618
3619 void rt6_purge_dflt_routers(struct net *net)
3620 {
3621         struct fib6_table *table;
3622         struct hlist_head *head;
3623         unsigned int h;
3624
3625         rcu_read_lock();
3626
3627         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3628                 head = &net->ipv6.fib_table_hash[h];
3629                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3630                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3631                                 __rt6_purge_dflt_routers(net, table);
3632                 }
3633         }
3634
3635         rcu_read_unlock();
3636 }
3637
3638 static void rtmsg_to_fib6_config(struct net *net,
3639                                  struct in6_rtmsg *rtmsg,
3640                                  struct fib6_config *cfg)
3641 {
3642         *cfg = (struct fib6_config){
3643                 .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3644                          : RT6_TABLE_MAIN,
3645                 .fc_ifindex = rtmsg->rtmsg_ifindex,
3646                 .fc_metric = rtmsg->rtmsg_metric ? : IP6_RT_PRIO_USER,
3647                 .fc_expires = rtmsg->rtmsg_info,
3648                 .fc_dst_len = rtmsg->rtmsg_dst_len,
3649                 .fc_src_len = rtmsg->rtmsg_src_len,
3650                 .fc_flags = rtmsg->rtmsg_flags,
3651                 .fc_type = rtmsg->rtmsg_type,
3652
3653                 .fc_nlinfo.nl_net = net,
3654
3655                 .fc_dst = rtmsg->rtmsg_dst,
3656                 .fc_src = rtmsg->rtmsg_src,
3657                 .fc_gateway = rtmsg->rtmsg_gateway,
3658         };
3659 }
3660
3661 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3662 {
3663         struct fib6_config cfg;
3664         struct in6_rtmsg rtmsg;
3665         int err;
3666
3667         switch (cmd) {
3668         case SIOCADDRT:         /* Add a route */
3669         case SIOCDELRT:         /* Delete a route */
3670                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3671                         return -EPERM;
3672                 err = copy_from_user(&rtmsg, arg,
3673                                      sizeof(struct in6_rtmsg));
3674                 if (err)
3675                         return -EFAULT;
3676
3677                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3678
3679                 rtnl_lock();
3680                 switch (cmd) {
3681                 case SIOCADDRT:
3682                         err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3683                         break;
3684                 case SIOCDELRT:
3685                         err = ip6_route_del(&cfg, NULL);
3686                         break;
3687                 default:
3688                         err = -EINVAL;
3689                 }
3690                 rtnl_unlock();
3691
3692                 return err;
3693         }
3694
3695         return -EINVAL;
3696 }
3697
3698 /*
3699  *      Drop the packet on the floor
3700  */
3701
/* Terminal drop handler: charge the appropriate SNMP counter, send an
 * ICMPv6 destination-unreachable with @code, and free the packet.
 *
 * @ipstats_mib_noroutes selects the counter family: input vs. output
 * no-route.
 */
static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
	int type;
	struct dst_entry *dst = skb_dst(skb);
	switch (ipstats_mib_noroutes) {
	case IPSTATS_MIB_INNOROUTES:
		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
		if (type == IPV6_ADDR_ANY) {
			/* Unspecified destination counts as an address
			 * error, not a routing failure.
			 */
			IP6_INC_STATS(dev_net(dst->dev),
				      __in6_dev_get_safely(skb->dev),
				      IPSTATS_MIB_INADDRERRORS);
			break;
		}
		/* FALLTHROUGH */
	case IPSTATS_MIB_OUTNOROUTES:
		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
			      ipstats_mib_noroutes);
		break;
	}
	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
	kfree_skb(skb);
	return 0;
}
3725
/* Input-path drop: ICMPv6 "no route", charged to input no-route stats. */
static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
3730
/* Output-path drop: ICMPv6 "no route", charged to output no-route stats. */
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}
3736
/* Input-path drop: ICMPv6 "administratively prohibited". */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}
3741
/* Output-path drop: ICMPv6 "administratively prohibited". */
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}
3747
3748 /*
3749  *      Allocate a dst for local (unicast / anycast) address.
3750  */
3751
3752 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3753                                      struct inet6_dev *idev,
3754                                      const struct in6_addr *addr,
3755                                      bool anycast, gfp_t gfp_flags)
3756 {
3757         struct fib6_config cfg = {
3758                 .fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL,
3759                 .fc_ifindex = idev->dev->ifindex,
3760                 .fc_flags = RTF_UP | RTF_ADDRCONF | RTF_NONEXTHOP,
3761                 .fc_dst = *addr,
3762                 .fc_dst_len = 128,
3763                 .fc_protocol = RTPROT_KERNEL,
3764                 .fc_nlinfo.nl_net = net,
3765                 .fc_ignore_dev_down = true,
3766         };
3767
3768         if (anycast) {
3769                 cfg.fc_type = RTN_ANYCAST;
3770                 cfg.fc_flags |= RTF_ANYCAST;
3771         } else {
3772                 cfg.fc_type = RTN_LOCAL;
3773                 cfg.fc_flags |= RTF_LOCAL;
3774         }
3775
3776         return ip6_route_info_create(&cfg, gfp_flags, NULL);
3777 }
3778
3779 /* remove deleted ip from prefsrc entries */
struct arg_dev_net_ip {
	struct net_device *dev;	/* restrict match to this device; NULL matches any */
	struct net *net;	/* namespace, used to skip fib6_null_entry */
	struct in6_addr *addr;	/* the prefsrc address being removed */
};
3785
3786 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3787 {
3788         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3789         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3790         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3791
3792         if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3793             rt != net->ipv6.fib6_null_entry &&
3794             ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3795                 spin_lock_bh(&rt6_exception_lock);
3796                 /* remove prefsrc entry */
3797                 rt->fib6_prefsrc.plen = 0;
3798                 spin_unlock_bh(&rt6_exception_lock);
3799         }
3800         return 0;
3801 }
3802
3803 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3804 {
3805         struct net *net = dev_net(ifp->idev->dev);
3806         struct arg_dev_net_ip adni = {
3807                 .dev = ifp->idev->dev,
3808                 .net = net,
3809                 .addr = &ifp->addr,
3810         };
3811         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3812 }
3813
3814 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3815
3816 /* Remove routers and update dst entries when gateway turn into host. */
3817 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3818 {
3819         struct in6_addr *gateway = (struct in6_addr *)arg;
3820
3821         if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3822             ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
3823                 return -1;
3824         }
3825
3826         /* Further clean up cached routes in exception table.
3827          * This is needed because cached route may have a different
3828          * gateway than its 'parent' in the case of an ip redirect.
3829          */
3830         rt6_exceptions_clean_tohost(rt, gateway);
3831
3832         return 0;
3833 }
3834
/* Walk all tables, removing RA router entries via @gateway and cleaning
 * matching cached exception routes (the gateway became a plain host).
 */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}
3839
/* Argument for per-device fib walkers: the device plus either the
 * nexthop flags to clear (fib6_ifup) or the netdev event (fib6_ifdown).
 */
struct arg_netdev_event {
	const struct net_device *dev;
	union {
		unsigned int nh_flags;
		unsigned long event;
	};
};
3847
/* Return the first route in rt's fib6 node leaf list that qualifies as
 * a multipath sibling of rt (same metric, ECMP-capable), or NULL.
 * Caller must hold the table write lock (see lockdep annotations).
 */
static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
{
	struct fib6_info *iter;
	struct fib6_node *fn;

	fn = rcu_dereference_protected(rt->fib6_node,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
	iter = rcu_dereference_protected(fn->leaf,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
	while (iter) {
		if (iter->fib6_metric == rt->fib6_metric &&
		    rt6_qualify_for_ecmp(iter))
			return iter;
		iter = rcu_dereference_protected(iter->fib6_next,
				lockdep_is_held(&rt->fib6_table->tb6_lock));
	}

	return NULL;
}
3867
3868 static bool rt6_is_dead(const struct fib6_info *rt)
3869 {
3870         if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3871             (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3872              fib6_ignore_linkdown(rt)))
3873                 return true;
3874
3875         return false;
3876 }
3877
3878 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3879 {
3880         struct fib6_info *iter;
3881         int total = 0;
3882
3883         if (!rt6_is_dead(rt))
3884                 total += rt->fib6_nh.nh_weight;
3885
3886         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3887                 if (!rt6_is_dead(iter))
3888                         total += iter->fib6_nh.nh_weight;
3889         }
3890
3891         return total;
3892 }
3893
/* Recompute the atomic upper bound used by weighted multipath hash
 * selection: dead nexthops get -1 (never selected); live ones scale the
 * running cumulative weight into the 31-bit hash space.
 */
static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
{
	int upper_bound = -1;

	if (!rt6_is_dead(rt)) {
		*weight += rt->fib6_nh.nh_weight;
		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
						    total) - 1;
	}
	atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
}
3905
/* Assign hash upper bounds across @rt and its siblings; the running
 * weight makes the bounds increase monotonically in list order.
 */
static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
{
	struct fib6_info *iter;
	int weight = 0;

	rt6_upper_bound_set(rt, &weight, total);

	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		rt6_upper_bound_set(iter, &weight, total);
}
3916
3917 void rt6_multipath_rebalance(struct fib6_info *rt)
3918 {
3919         struct fib6_info *first;
3920         int total;
3921
3922         /* In case the entire multipath route was marked for flushing,
3923          * then there is no need to rebalance upon the removal of every
3924          * sibling route.
3925          */
3926         if (!rt->fib6_nsiblings || rt->should_flush)
3927                 return;
3928
3929         /* During lookup routes are evaluated in order, so we need to
3930          * make sure upper bounds are assigned from the first sibling
3931          * onwards.
3932          */
3933         first = rt6_multipath_first_sibling(rt);
3934         if (WARN_ON_ONCE(!first))
3935                 return;
3936
3937         total = rt6_multipath_total_weight(first);
3938         rt6_multipath_upper_bound_set(first, total);
3939 }
3940
3941 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3942 {
3943         const struct arg_netdev_event *arg = p_arg;
3944         struct net *net = dev_net(arg->dev);
3945
3946         if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
3947                 rt->fib6_nh.nh_flags &= ~arg->nh_flags;
3948                 fib6_update_sernum_upto_root(net, rt);
3949                 rt6_multipath_rebalance(rt);
3950         }
3951
3952         return 0;
3953 }
3954
/* Bring routes through @dev back up by clearing @nh_flags on their
 * nexthops.  When reviving a DEAD nexthop whose carrier is up, the
 * LINKDOWN flag is cleared along with it.
 */
void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.nh_flags = nh_flags,
		},
	};

	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
		arg.nh_flags |= RTNH_F_LINKDOWN;

	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
}
3969
3970 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3971                                    const struct net_device *dev)
3972 {
3973         struct fib6_info *iter;
3974
3975         if (rt->fib6_nh.nh_dev == dev)
3976                 return true;
3977         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3978                 if (iter->fib6_nh.nh_dev == dev)
3979                         return true;
3980
3981         return false;
3982 }
3983
/* Mark @rt and all of its multipath siblings for deletion (consumed by
 * fib6_ifdown()).
 */
static void rt6_multipath_flush(struct fib6_info *rt)
{
	struct fib6_info *iter;

	rt->should_flush = 1;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		iter->should_flush = 1;
}
3992
3993 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
3994                                              const struct net_device *down_dev)
3995 {
3996         struct fib6_info *iter;
3997         unsigned int dead = 0;
3998
3999         if (rt->fib6_nh.nh_dev == down_dev ||
4000             rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4001                 dead++;
4002         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4003                 if (iter->fib6_nh.nh_dev == down_dev ||
4004                     iter->fib6_nh.nh_flags & RTNH_F_DEAD)
4005                         dead++;
4006
4007         return dead;
4008 }
4009
4010 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
4011                                        const struct net_device *dev,
4012                                        unsigned int nh_flags)
4013 {
4014         struct fib6_info *iter;
4015
4016         if (rt->fib6_nh.nh_dev == dev)
4017                 rt->fib6_nh.nh_flags |= nh_flags;
4018         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4019                 if (iter->fib6_nh.nh_dev == dev)
4020                         iter->fib6_nh.nh_flags |= nh_flags;
4021 }
4022
/* called with write lock held for table with rt
 *
 * fib6_clean_all() callback for rt6_sync_down_dev().  The return value
 * is interpreted by the fib walker: 0 keeps the route, -1 deletes it;
 * NOTE(review): -2 also appears to delete but with different
 * notification handling — confirm against fib6_clean_node().
 */
static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
{
	const struct arg_netdev_event *arg = p_arg;
	const struct net_device *dev = arg->dev;
	struct net *net = dev_net(dev);

	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	switch (arg->event) {
	case NETDEV_UNREGISTER:
		/* Device is going away entirely: delete its routes. */
		return rt->fib6_nh.nh_dev == dev ? -1 : 0;
	case NETDEV_DOWN:
		if (rt->should_flush)
			return -1;
		if (!rt->fib6_nsiblings)
			return rt->fib6_nh.nh_dev == dev ? -1 : 0;
		if (rt6_multipath_uses_dev(rt, dev)) {
			unsigned int count;

			count = rt6_multipath_dead_count(rt, dev);
			if (rt->fib6_nsiblings + 1 == count) {
				/* Every nexthop is dead: flush the
				 * whole multipath route.
				 */
				rt6_multipath_flush(rt);
				return -1;
			}
			/* Some nexthops survive: mark the affected ones
			 * and rebalance the remaining weights.
			 */
			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
						   RTNH_F_LINKDOWN);
			fib6_update_sernum(net, rt);
			rt6_multipath_rebalance(rt);
		}
		return -2;
	case NETDEV_CHANGE:
		if (rt->fib6_nh.nh_dev != dev ||
		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
			break;
		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
		rt6_multipath_rebalance(rt);
		break;
	}

	return 0;
}
4066
4067 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4068 {
4069         struct arg_netdev_event arg = {
4070                 .dev = dev,
4071                 {
4072                         .event = event,
4073                 },
4074         };
4075         struct net *net = dev_net(dev);
4076
4077         if (net->ipv6.sysctl.skip_notify_on_dev_down)
4078                 fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
4079         else
4080                 fib6_clean_all(net, fib6_ifdown, &arg);
4081 }
4082
/* Tear down IPv6 routing state for @dev: sync routes down for the
 * event, flush its uncached dst entries, and drop its ND neighbours.
 */
void rt6_disable_ip(struct net_device *dev, unsigned long event)
{
	rt6_sync_down_dev(dev, event);
	rt6_uncached_list_flush_dev(dev_net(dev), dev);
	neigh_ifdown(&nd_tbl, dev);
}
4089
/* Argument for rt6_mtu_change_route(): the device whose MTU changed
 * and the new MTU value.
 */
struct rt6_mtu_change_arg {
	struct net_device *dev;
	unsigned int mtu;
};
4094
/* fib6_clean_all() callback for rt6_mtu_change(): propagate a device
 * MTU change into matching routes' MTU metric and their cached
 * exception entries.  Always returns 0 (never deletes a route).
 */
static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e. jumbo frame)
	 */
	if (rt->fib6_nh.nh_dev == arg->dev &&
	    !fib6_metric_locked(rt, RTAX_MTU)) {
		u32 mtu = rt->fib6_pmtu;

		/* Update when raising the MTU, or when lowering it and
		 * the route's current MTU matches the device MTU.
		 */
		if (mtu >= arg->mtu ||
		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
			fib6_metric_set(rt, RTAX_MTU, arg->mtu);

		/* Cached exception routes track their own PMTU state. */
		spin_lock_bh(&rt6_exception_lock);
		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}
4129
4130 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4131 {
4132         struct rt6_mtu_change_arg arg = {
4133                 .dev = dev,
4134                 .mtu = mtu,
4135         };
4136
4137         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4138 }
4139
/* Netlink attribute validation policy for RTM_{NEW,DEL,GET}ROUTE. */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
	[RTA_PREFSRC]           = { .len = sizeof(struct in6_addr) },
	[RTA_OIF]               = { .type = NLA_U32 },
	[RTA_IIF]               = { .type = NLA_U32 },
	[RTA_PRIORITY]          = { .type = NLA_U32 },
	[RTA_METRICS]           = { .type = NLA_NESTED },
	[RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]              = { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
	[RTA_ENCAP]             = { .type = NLA_NESTED },
	[RTA_EXPIRES]           = { .type = NLA_U32 },
	[RTA_UID]               = { .type = NLA_U32 },
	[RTA_MARK]              = { .type = NLA_U32 },
	[RTA_TABLE]             = { .type = NLA_U32 },
	[RTA_IP_PROTO]          = { .type = NLA_U8 },
	[RTA_SPORT]             = { .type = NLA_U16 },
	[RTA_DPORT]             = { .type = NLA_U16 },
};
4159
4160 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4161                               struct fib6_config *cfg,
4162                               struct netlink_ext_ack *extack)
4163 {
4164         struct rtmsg *rtm;
4165         struct nlattr *tb[RTA_MAX+1];
4166         unsigned int pref;
4167         int err;
4168
4169         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4170                           extack);
4171         if (err < 0)
4172                 goto errout;
4173
4174         err = -EINVAL;
4175         rtm = nlmsg_data(nlh);
4176
4177         *cfg = (struct fib6_config){
4178                 .fc_table = rtm->rtm_table,
4179                 .fc_dst_len = rtm->rtm_dst_len,
4180                 .fc_src_len = rtm->rtm_src_len,
4181                 .fc_flags = RTF_UP,
4182                 .fc_protocol = rtm->rtm_protocol,
4183                 .fc_type = rtm->rtm_type,
4184
4185                 .fc_nlinfo.portid = NETLINK_CB(skb).portid,
4186                 .fc_nlinfo.nlh = nlh,
4187                 .fc_nlinfo.nl_net = sock_net(skb->sk),
4188         };
4189
4190         if (rtm->rtm_type == RTN_UNREACHABLE ||
4191             rtm->rtm_type == RTN_BLACKHOLE ||
4192             rtm->rtm_type == RTN_PROHIBIT ||
4193             rtm->rtm_type == RTN_THROW)
4194                 cfg->fc_flags |= RTF_REJECT;
4195
4196         if (rtm->rtm_type == RTN_LOCAL)
4197                 cfg->fc_flags |= RTF_LOCAL;
4198
4199         if (rtm->rtm_flags & RTM_F_CLONED)
4200                 cfg->fc_flags |= RTF_CACHE;
4201
4202         cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4203
4204         if (tb[RTA_GATEWAY]) {
4205                 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4206                 cfg->fc_flags |= RTF_GATEWAY;
4207         }
4208         if (tb[RTA_VIA]) {
4209                 NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
4210                 goto errout;
4211         }
4212
4213         if (tb[RTA_DST]) {
4214                 int plen = (rtm->rtm_dst_len + 7) >> 3;
4215
4216                 if (nla_len(tb[RTA_DST]) < plen)
4217                         goto errout;
4218
4219                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4220         }
4221
4222         if (tb[RTA_SRC]) {
4223                 int plen = (rtm->rtm_src_len + 7) >> 3;
4224
4225                 if (nla_len(tb[RTA_SRC]) < plen)
4226                         goto errout;
4227
4228                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4229         }
4230
4231         if (tb[RTA_PREFSRC])
4232                 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4233
4234         if (tb[RTA_OIF])
4235                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4236
4237         if (tb[RTA_PRIORITY])
4238                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4239
4240         if (tb[RTA_METRICS]) {
4241                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4242                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4243         }
4244
4245         if (tb[RTA_TABLE])
4246                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4247
4248         if (tb[RTA_MULTIPATH]) {
4249                 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4250                 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4251
4252                 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4253                                                      cfg->fc_mp_len, extack);
4254                 if (err < 0)
4255                         goto errout;
4256         }
4257
4258         if (tb[RTA_PREF]) {
4259                 pref = nla_get_u8(tb[RTA_PREF]);
4260                 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4261                     pref != ICMPV6_ROUTER_PREF_HIGH)
4262                         pref = ICMPV6_ROUTER_PREF_MEDIUM;
4263                 cfg->fc_flags |= RTF_PREF(pref);
4264         }
4265
4266         if (tb[RTA_ENCAP])
4267                 cfg->fc_encap = tb[RTA_ENCAP];
4268
4269         if (tb[RTA_ENCAP_TYPE]) {
4270                 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4271
4272                 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4273                 if (err < 0)
4274                         goto errout;
4275         }
4276
4277         if (tb[RTA_EXPIRES]) {
4278                 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4279
4280                 if (addrconf_finite_timeout(timeout)) {
4281                         cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4282                         cfg->fc_flags |= RTF_EXPIRES;
4283                 }
4284         }
4285
4286         err = 0;
4287 errout:
4288         return err;
4289 }
4290
/* Bookkeeping node used while building a multipath route: one entry per
 * nexthop parsed out of RTA_MULTIPATH, queued on a local list until all
 * nexthops have been validated and inserted (see ip6_route_multipath_add()).
 */
struct rt6_nh {
	struct fib6_info *fib6_info;	/* route for this nexthop; reference
					 * dropped once the route is inserted
					 * or on cleanup
					 */
	struct fib6_config r_cfg;	/* per-nexthop config, kept so a failed
					 * insert can roll back with ip6_route_del()
					 */
	struct list_head next;		/* link on the local rt6_nh_list */
};
4296
4297 static int ip6_route_info_append(struct net *net,
4298                                  struct list_head *rt6_nh_list,
4299                                  struct fib6_info *rt,
4300                                  struct fib6_config *r_cfg)
4301 {
4302         struct rt6_nh *nh;
4303         int err = -EEXIST;
4304
4305         list_for_each_entry(nh, rt6_nh_list, next) {
4306                 /* check if fib6_info already exists */
4307                 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4308                         return err;
4309         }
4310
4311         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4312         if (!nh)
4313                 return -ENOMEM;
4314         nh->fib6_info = rt;
4315         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4316         list_add_tail(&nh->next, rt6_nh_list);
4317
4318         return 0;
4319 }
4320
4321 static void ip6_route_mpath_notify(struct fib6_info *rt,
4322                                    struct fib6_info *rt_last,
4323                                    struct nl_info *info,
4324                                    __u16 nlflags)
4325 {
4326         /* if this is an APPEND route, then rt points to the first route
4327          * inserted and rt_last points to last route inserted. Userspace
4328          * wants a consistent dump of the route which starts at the first
4329          * nexthop. Since sibling routes are always added at the end of
4330          * the list, find the first sibling of the last route appended
4331          */
4332         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4333                 rt = list_first_entry(&rt_last->fib6_siblings,
4334                                       struct fib6_info,
4335                                       fib6_siblings);
4336         }
4337
4338         if (rt)
4339                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4340 }
4341
/* Install a multipath (ECMP) route described by an RTA_MULTIPATH
 * attribute: parse each rtnexthop into its own fib6_info, queue them on
 * a local list, then insert them one by one as sibling routes.  On a
 * partial failure, already-inserted nexthops are rolled back (deleted)
 * and one coherent notification is sent for whatever was added first.
 */
static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct fib6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	__u16 nlflags;
	int remaining;
	int attrlen;
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);

	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * fib6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		/* start from the route-level config, then override with the
		 * per-nexthop ifindex/gateway/encap attributes
		 */
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}
		if (!rt6_qualify_for_ecmp(rt)) {
			err = -EINVAL;
			NL_SET_ERR_MSG(extack,
				       "Device only routes can not be added for IPv6 using the multipath API.");
			fib6_info_release(rt);
			goto cleanup;
		}

		/* rtnh_hops carries the weight minus one on the wire */
		rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;

		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
					    rt, &r_cfg);
		if (err) {
			fib6_info_release(rt);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* for add and replace send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		err = __ip6_ins_rt(nh->fib6_info, info, extack);
		/* drop the list's reference.  NOTE(review): the rt_last /
		 * rt_notif pointers saved below presumably stay valid because
		 * a successful insert leaves the FIB tree holding its own
		 * reference — confirm against fib6_add()
		 */
		fib6_info_release(nh->fib6_info);

		if (!err) {
			/* save reference to last route successfully inserted */
			rt_last = nh->fib6_info;

			/* save reference to first route for notification */
			if (!rt_notif)
				rt_notif = nh->fib6_info;
		}

		/* nh->fib6_info is used or freed at this point, reset to NULL*/
		nh->fib6_info = NULL;
		if (err) {
			if (replace && nhn)
				NL_SET_ERR_MSG_MOD(extack,
						   "multipath route replace failed (check consistency of installed routes)");
			err_nh = nh;
			goto add_errout;
		}

		/* Because each route is added like a single route we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
		 */
		/* NOTE(review): nlh is dereferenced unconditionally here while
		 * the 'replace' computation above NULL-checked it — presumably
		 * the rtnetlink path always supplies nlh; confirm
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		nhn++;
	}

	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	goto cleanup;

add_errout:
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg, extack);
	}

cleanup:
	/* free the list nodes and any fib6_info not consumed by insertion */
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->fib6_info)
			fib6_info_release(nh->fib6_info);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}
4489
4490 static int ip6_route_multipath_del(struct fib6_config *cfg,
4491                                    struct netlink_ext_ack *extack)
4492 {
4493         struct fib6_config r_cfg;
4494         struct rtnexthop *rtnh;
4495         int remaining;
4496         int attrlen;
4497         int err = 1, last_err = 0;
4498
4499         remaining = cfg->fc_mp_len;
4500         rtnh = (struct rtnexthop *)cfg->fc_mp;
4501
4502         /* Parse a Multipath Entry */
4503         while (rtnh_ok(rtnh, remaining)) {
4504                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4505                 if (rtnh->rtnh_ifindex)
4506                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4507
4508                 attrlen = rtnh_attrlen(rtnh);
4509                 if (attrlen > 0) {
4510                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4511
4512                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4513                         if (nla) {
4514                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4515                                 r_cfg.fc_flags |= RTF_GATEWAY;
4516                         }
4517                 }
4518                 err = ip6_route_del(&r_cfg, extack);
4519                 if (err)
4520                         last_err = err;
4521
4522                 rtnh = rtnh_next(rtnh, &remaining);
4523         }
4524
4525         return last_err;
4526 }
4527
4528 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4529                               struct netlink_ext_ack *extack)
4530 {
4531         struct fib6_config cfg;
4532         int err;
4533
4534         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4535         if (err < 0)
4536                 return err;
4537
4538         if (cfg.fc_mp)
4539                 return ip6_route_multipath_del(&cfg, extack);
4540         else {
4541                 cfg.fc_delete_all_nh = 1;
4542                 return ip6_route_del(&cfg, extack);
4543         }
4544 }
4545
4546 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4547                               struct netlink_ext_ack *extack)
4548 {
4549         struct fib6_config cfg;
4550         int err;
4551
4552         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4553         if (err < 0)
4554                 return err;
4555
4556         if (cfg.fc_metric == 0)
4557                 cfg.fc_metric = IP6_RT_PRIO_USER;
4558
4559         if (cfg.fc_mp)
4560                 return ip6_route_multipath_add(&cfg, extack);
4561         else
4562                 return ip6_route_add(&cfg, GFP_KERNEL, extack);
4563 }
4564
4565 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4566 {
4567         int nexthop_len = 0;
4568
4569         if (rt->fib6_nsiblings) {
4570                 nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
4571                             + NLA_ALIGN(sizeof(struct rtnexthop))
4572                             + nla_total_size(16) /* RTA_GATEWAY */
4573                             + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
4574
4575                 nexthop_len *= rt->fib6_nsiblings;
4576         }
4577
4578         return NLMSG_ALIGN(sizeof(struct rtmsg))
4579                + nla_total_size(16) /* RTA_SRC */
4580                + nla_total_size(16) /* RTA_DST */
4581                + nla_total_size(16) /* RTA_GATEWAY */
4582                + nla_total_size(16) /* RTA_PREFSRC */
4583                + nla_total_size(4) /* RTA_TABLE */
4584                + nla_total_size(4) /* RTA_IIF */
4585                + nla_total_size(4) /* RTA_OIF */
4586                + nla_total_size(4) /* RTA_PRIORITY */
4587                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4588                + nla_total_size(sizeof(struct rta_cacheinfo))
4589                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4590                + nla_total_size(1) /* RTA_PREF */
4591                + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
4592                + nexthop_len;
4593 }
4594
4595 static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
4596                             unsigned int *flags, bool skip_oif)
4597 {
4598         if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4599                 *flags |= RTNH_F_DEAD;
4600
4601         if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
4602                 *flags |= RTNH_F_LINKDOWN;
4603
4604                 rcu_read_lock();
4605                 if (fib6_ignore_linkdown(rt))
4606                         *flags |= RTNH_F_DEAD;
4607                 rcu_read_unlock();
4608         }
4609
4610         if (rt->fib6_flags & RTF_GATEWAY) {
4611                 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
4612                         goto nla_put_failure;
4613         }
4614
4615         *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
4616         if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
4617                 *flags |= RTNH_F_OFFLOAD;
4618
4619         /* not needed for multipath encoding b/c it has a rtnexthop struct */
4620         if (!skip_oif && rt->fib6_nh.nh_dev &&
4621             nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
4622                 goto nla_put_failure;
4623
4624         if (rt->fib6_nh.nh_lwtstate &&
4625             lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
4626                 goto nla_put_failure;
4627
4628         return 0;
4629
4630 nla_put_failure:
4631         return -EMSGSIZE;
4632 }
4633
4634 /* add multipath next hop */
4635 static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
4636 {
4637         const struct net_device *dev = rt->fib6_nh.nh_dev;
4638         struct rtnexthop *rtnh;
4639         unsigned int flags = 0;
4640
4641         rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4642         if (!rtnh)
4643                 goto nla_put_failure;
4644
4645         rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
4646         rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
4647
4648         if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4649                 goto nla_put_failure;
4650
4651         rtnh->rtnh_flags = flags;
4652
4653         /* length of rtnetlink header + attributes */
4654         rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4655
4656         return 0;
4657
4658 nla_put_failure:
4659         return -EMSGSIZE;
4660 }
4661
/* Fill one routing netlink message of @type for @rt into @skb.
 *
 * @rt is the FIB entry.  @dst, when non-NULL, is the dst_entry of a
 * cached/cloned rt6_info whose keys and flags take precedence over the
 * FIB entry's.  @dest/@src, when non-NULL, are the concrete addresses
 * of a route lookup and force /128 prefix lengths in the dump.  @iif
 * distinguishes input-route replies from output ones.
 *
 * Returns 0 on success or -EMSGSIZE when @skb has no room; a partially
 * built message is cancelled before returning the error.
 */
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	struct rt6_info *rt6 = (struct rt6_info *)dst;
	struct rt6key *rt6_dst, *rt6_src;
	u32 *pmetrics, table, rt6_flags;
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;
	long expires = 0;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	/* prefer the clone's keys/flags when dumping a cached route */
	if (rt6) {
		rt6_dst = &rt6->rt6i_dst;
		rt6_src = &rt6->rt6i_src;
		rt6_flags = rt6->rt6i_flags;
	} else {
		rt6_dst = &rt->fib6_dst;
		rt6_src = &rt->fib6_src;
		rt6_flags = rt->fib6_flags;
	}

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt6_dst->plen;
	rtm->rtm_src_len = rt6_src->plen;
	rtm->rtm_tos = 0;
	if (rt->fib6_table)
		table = rt->fib6_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	/* the legacy 8-bit header field cannot hold large table ids */
	rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;

	rtm->rtm_type = rt->fib6_type;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->fib6_protocol;

	if (rt6_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	/* a concrete lookup destination is reported as a host route */
	if (dest) {
		if (nla_put_in6_addr(skb, RTA_DST, dest))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		/* multicast destinations are resolved by the mroute code:
		 * 0 means the reply was handled there, <0 is a hard error
		 */
		if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dest) {
		/* output lookup: report the source address that would be
		 * selected for this destination
		 */
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->fib6_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->fib6_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	/* metrics come from the dst when dumping a clone, from the FIB
	 * entry otherwise
	 */
	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt6) {
		if (rt6_flags & RTF_GATEWAY &&
		    nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
			goto nla_put_failure;

		if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
			goto nla_put_failure;
	} else if (rt->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		/* the entry itself is the first nexthop ... */
		if (rt6_add_nexthop(skb, rt) < 0)
			goto nla_put_failure;

		/* ... followed by each of its siblings */
		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings, fib6_siblings) {
			if (rt6_add_nexthop(skb, sibling) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
			goto nla_put_failure;
	}

	/* expiry is reported relative to now */
	if (rt6_flags & RTF_EXPIRES) {
		expires = dst ? dst->expires : rt->expires;
		expires -= jiffies;
	}

	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
		goto nla_put_failure;


	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
4812
4813 static bool fib6_info_uses_dev(const struct fib6_info *f6i,
4814                                const struct net_device *dev)
4815 {
4816         if (f6i->fib6_nh.nh_dev == dev)
4817                 return true;
4818
4819         if (f6i->fib6_nsiblings) {
4820                 struct fib6_info *sibling, *next_sibling;
4821
4822                 list_for_each_entry_safe(sibling, next_sibling,
4823                                          &f6i->fib6_siblings, fib6_siblings) {
4824                         if (sibling->fib6_nh.nh_dev == dev)
4825                                 return true;
4826                 }
4827         }
4828
4829         return false;
4830 }
4831
4832 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4833 {
4834         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4835         struct fib_dump_filter *filter = &arg->filter;
4836         unsigned int flags = NLM_F_MULTI;
4837         struct net *net = arg->net;
4838
4839         if (rt == net->ipv6.fib6_null_entry)
4840                 return 0;
4841
4842         if ((filter->flags & RTM_F_PREFIX) &&
4843             !(rt->fib6_flags & RTF_PREFIX_RT)) {
4844                 /* success since this is not a prefix route */
4845                 return 1;
4846         }
4847         if (filter->filter_set) {
4848                 if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
4849                     (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
4850                     (filter->protocol && rt->fib6_protocol != filter->protocol)) {
4851                         return 1;
4852                 }
4853                 flags |= NLM_F_DUMP_FILTERED;
4854         }
4855
4856         return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4857                              RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4858                              arg->cb->nlh->nlmsg_seq, flags);
4859 }
4860
4861 static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
4862                                         const struct nlmsghdr *nlh,
4863                                         struct nlattr **tb,
4864                                         struct netlink_ext_ack *extack)
4865 {
4866         struct rtmsg *rtm;
4867         int i, err;
4868
4869         if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
4870                 NL_SET_ERR_MSG_MOD(extack,
4871                                    "Invalid header for get route request");
4872                 return -EINVAL;
4873         }
4874
4875         if (!netlink_strict_get_check(skb))
4876                 return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX,
4877                                    rtm_ipv6_policy, extack);
4878
4879         rtm = nlmsg_data(nlh);
4880         if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
4881             (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
4882             rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
4883             rtm->rtm_type) {
4884                 NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
4885                 return -EINVAL;
4886         }
4887         if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
4888                 NL_SET_ERR_MSG_MOD(extack,
4889                                    "Invalid flags for get route request");
4890                 return -EINVAL;
4891         }
4892
4893         err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
4894                                  rtm_ipv6_policy, extack);
4895         if (err)
4896                 return err;
4897
4898         if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
4899             (tb[RTA_DST] && !rtm->rtm_dst_len)) {
4900                 NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
4901                 return -EINVAL;
4902         }
4903
4904         for (i = 0; i <= RTA_MAX; i++) {
4905                 if (!tb[i])
4906                         continue;
4907
4908                 switch (i) {
4909                 case RTA_SRC:
4910                 case RTA_DST:
4911                 case RTA_IIF:
4912                 case RTA_OIF:
4913                 case RTA_MARK:
4914                 case RTA_UID:
4915                 case RTA_SPORT:
4916                 case RTA_DPORT:
4917                 case RTA_IP_PROTO:
4918                         break;
4919                 default:
4920                         NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
4921                         return -EINVAL;
4922                 }
4923         }
4924
4925         return 0;
4926 }
4927
/* RTM_GETROUTE handler: resolve a single route for the keys supplied in
 * the request and unicast the result back to the requester.
 *
 * With RTM_F_FIB_MATCH set, the matching FIB entry is reported instead
 * of the dst chosen by the lookup.
 */
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct fib6_info *from;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6 = {};
	bool fibmatch;

	err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

	/* build the flow key from the request attributes */
	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (tb[RTA_SPORT])
		fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);

	if (tb[RTA_DPORT])
		fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);

	if (tb[RTA_IP_PROTO]) {
		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
						  &fl6.flowi6_proto, AF_INET6,
						  extack);
		if (err)
			goto errout;
	}

	/* with an incoming interface do an input-path lookup,
	 * otherwise an output-path lookup
	 */
	if (iif) {
		struct net_device *dev;
		int flags = 0;

		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			rcu_read_unlock();
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);

		rcu_read_unlock();
	} else {
		fl6.flowi6_oif = oif;

		dst = ip6_route_output(net, NULL, &fl6);
	}


	rt = container_of(dst, struct rt6_info, dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	/* NOTE(review): this looks redundant with the dst.error check
	 * above — presumably ip6_null_entry carries a nonzero dst.error;
	 * confirm before removing
	 */
	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	/* the skb takes over the dst reference */
	skb_dst_set(skb, &rt->dst);

	rcu_read_lock();
	from = rcu_dereference(rt->from);

	if (fibmatch)
		err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	else
		err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
				    &fl6.saddr, iif, RTM_NEWROUTE,
				    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
				    0);
	rcu_read_unlock();

	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}
5067
5068 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
5069                      unsigned int nlm_flags)
5070 {
5071         struct sk_buff *skb;
5072         struct net *net = info->nl_net;
5073         u32 seq;
5074         int err;
5075
5076         err = -ENOBUFS;
5077         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
5078
5079         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
5080         if (!skb)
5081                 goto errout;
5082
5083         err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
5084                             event, info->portid, seq, nlm_flags);
5085         if (err < 0) {
5086                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
5087                 WARN_ON(err == -EMSGSIZE);
5088                 kfree_skb(skb);
5089                 goto errout;
5090         }
5091         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
5092                     info->nlh, gfp_any());
5093         return;
5094 errout:
5095         if (err < 0)
5096                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
5097 }
5098
5099 static int ip6_route_dev_notify(struct notifier_block *this,
5100                                 unsigned long event, void *ptr)
5101 {
5102         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
5103         struct net *net = dev_net(dev);
5104
5105         if (!(dev->flags & IFF_LOOPBACK))
5106                 return NOTIFY_OK;
5107
5108         if (event == NETDEV_REGISTER) {
5109                 net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
5110                 net->ipv6.ip6_null_entry->dst.dev = dev;
5111                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
5112 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5113                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
5114                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
5115                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
5116                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
5117 #endif
5118          } else if (event == NETDEV_UNREGISTER &&
5119                     dev->reg_state != NETREG_UNREGISTERED) {
5120                 /* NETDEV_UNREGISTER could be fired for multiple times by
5121                  * netdev_wait_allrefs(). Make sure we only call this once.
5122                  */
5123                 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
5124 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5125                 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
5126                 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
5127 #endif
5128         }
5129
5130         return NOTIFY_OK;
5131 }
5132
5133 /*
5134  *      /proc
5135  */
5136
5137 #ifdef CONFIG_PROC_FS
5138 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
5139 {
5140         struct net *net = (struct net *)seq->private;
5141         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
5142                    net->ipv6.rt6_stats->fib_nodes,
5143                    net->ipv6.rt6_stats->fib_route_nodes,
5144                    atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
5145                    net->ipv6.rt6_stats->fib_rt_entries,
5146                    net->ipv6.rt6_stats->fib_rt_cache,
5147                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
5148                    net->ipv6.rt6_stats->fib_discarded_routes);
5149
5150         return 0;
5151 }
5152 #endif  /* CONFIG_PROC_FS */
5153
5154 #ifdef CONFIG_SYSCTL
5155
/* Handler for the write-only /proc/sys/net/ipv6/route/flush sysctl.
 * Writing any value triggers an immediate fib6 garbage-collection run
 * for the owning netns (found via ctl->extra1); reads are rejected.
 *
 * NOTE(review): @delay is sampled *before* proc_dointvec() stores the
 * newly written value, so the GC run uses the previous flush_delay, not
 * the one just written -- confirm this ordering is intended.
 */
static
int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
			      void __user *buffer, size_t *lenp, loff_t *ppos)
{
	struct net *net;
	int delay;
	int ret;
	if (!write)
		return -EINVAL;

	net = (struct net *)ctl->extra1;
	delay = net->ipv6.sysctl.flush_delay;
	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
	if (ret)
		return ret;

	/* delay <= 0: flush now; delay > 0: schedule GC after that delay. */
	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
	return 0;
}
5175
/* Range clamps for skip_notify_on_dev_down (boolean 0/1). */
static int zero;
static int one = 1;

/* Template for /proc/sys/net/ipv6/route/*.  ipv6_route_sysctl_init()
 * kmemdup()s this table per netns and rebinds each .data pointer BY
 * INDEX, so the entry order here must stay in sync with that function.
 */
static struct ctl_table ipv6_route_table_template[] = {
	{
		/* Write-only trigger; handler ignores the stored value's reads. */
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		/* Millisecond-granularity alias sharing gc_min_interval's storage. */
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{
		.procname	=	"skip_notify_on_dev_down",
		.data		=	&init_net.ipv6.sysctl.skip_notify_on_dev_down,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
		.extra1		=	&zero,
		.extra2		=	&one,
	},
	{ }
};
5261
/* Duplicate ipv6_route_table_template for a new netns and point each
 * entry's .data at that netns' own storage.  The indices below must
 * match the entry order in the template.  Returns the table (caller
 * registers and eventually frees it) or NULL on allocation failure.
 */
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;	/* "flush" handler resolves its netns from extra1 */
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;

		/* Don't export sysctls to unprivileged users.  Registration
		 * stops at the first NULL procname, so clearing entry 0
		 * effectively hides the whole table.
		 */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
5291 #endif
5292
/* Per-netns setup for IPv6 routing: clone the dst_ops template, allocate
 * the special route entries (fib6 null, rt6 null, and with
 * CONFIG_IPV6_MULTIPLE_TABLES also prohibit and blackhole) from their
 * templates, and seed the sysctl defaults.  Returns 0 or -ENOMEM;
 * partial allocations are unwound through the goto ladder at the end.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	/* Rebind the copied entry to this netns' dst_ops and metrics. */
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	/* Default tunables, user-visible via /proc/sys/net/ipv6/route. */
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
	net->ipv6.sysctl.skip_notify_on_dev_down = 0;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

	/* Error unwind: free in reverse order of allocation. */
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
5368
5369 static void __net_exit ip6_route_net_exit(struct net *net)
5370 {
5371         kfree(net->ipv6.fib6_null_entry);
5372         kfree(net->ipv6.ip6_null_entry);
5373 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5374         kfree(net->ipv6.ip6_prohibit_entry);
5375         kfree(net->ipv6.ip6_blk_hole_entry);
5376 #endif
5377         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5378 }
5379
5380 static int __net_init ip6_route_net_init_late(struct net *net)
5381 {
5382 #ifdef CONFIG_PROC_FS
5383         proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
5384                         sizeof(struct ipv6_route_iter));
5385         proc_create_net_single("rt6_stats", 0444, net->proc_net,
5386                         rt6_stats_seq_show, NULL);
5387 #endif
5388         return 0;
5389 }
5390
/* Late per-netns teardown: remove the proc entries created by
 * ip6_route_net_init_late().
 */
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
5398
/* Core per-netns lifecycle for the IPv6 routing engine. */
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
5403
5404 static int __net_init ipv6_inetpeer_init(struct net *net)
5405 {
5406         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5407
5408         if (!bp)
5409                 return -ENOMEM;
5410         inet_peer_base_init(bp);
5411         net->ipv6.peers = bp;
5412         return 0;
5413 }
5414
5415 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5416 {
5417         struct inet_peer_base *bp = net->ipv6.peers;
5418
5419         net->ipv6.peers = NULL;
5420         inetpeer_invalidate_tree(bp);
5421         kfree(bp);
5422 }
5423
/* Per-netns lifecycle for the IPv6 inet_peer base. */
static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};
5428
/* Late per-netns lifecycle: /proc/net entries for IPv6 routes/stats. */
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};
5433
/* Netdev notifier hooking loopback register/unregister; priority is set
 * below addrconf's (ADDRCONF_NOTIFY_PRIORITY - 10), presumably so
 * addrconf processes device events before this handler -- confirm
 * against addrconf's notifier registration.
 */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
5438
5439 void __init ip6_route_init_special_entries(void)
5440 {
5441         /* Registering of the loopback is done before this portion of code,
5442          * the loopback reference in rt6_info will not be taken, do it
5443          * manually for init_net */
5444         init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
5445         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5446         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5447   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5448         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5449         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5450         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5451         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5452   #endif
5453 }
5454
/* Module init for the IPv6 routing subsystem.  Order matters: dst kmem
 * cache and blackhole dst counters first, then the inetpeer and core
 * pernet subsystems, fib6, xfrm6 and policy rules, the late pernet
 * subsystem (procfs), the rtnetlink route handlers, and finally the
 * netdev notifier.  Failures unwind in reverse through the goto ladder.
 * Returns 0 on success or a negative errno.
 */
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	/* Blackhole dsts come from the same slab as regular rt6_infos. */
	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	/* All three rtnetlink registrations share one unwind label:
	 * rtnl_unregister_all() below drops whatever did register.
	 */
	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	/* Per-CPU lists of uncached (DST_NOCACHE) routes. */
	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

	/* Error unwind, reverse order of the registrations above. */
out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
5546
/* Module teardown: undo ip6_route_init() in reverse registration order
 * (notifier first, kmem cache last).  Keep this sequence in sync with
 * the init function's goto unwind ladder.
 */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}