/*
 * net/ipv6/route.c
 * (web-archive header residue removed; see file header below)
 */
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <net/ip.h>
67 #include <linux/uaccess.h>
68
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72
73 static int ip6_rt_type_to_error(u8 fib6_type);
74
75 #define CREATE_TRACE_POINTS
76 #include <trace/events/fib6.h>
77 EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
78 #undef CREATE_TRACE_POINTS
79
/* Result of a neighbour reachability check on a route's nexthop.
 * Negative values are distinct failure modes; positive means usable.
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,		/* route must not be used */
	RT6_NUD_FAIL_PROBE = -2,	/* neighbour is in NUD_FAILED */
	RT6_NUD_FAIL_DO_RR = -1,	/* no neighbour entry; round-robin to next route */
	RT6_NUD_SUCCEED = 1		/* neighbour (probably) reachable */
};
86
87 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
88 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
89 static unsigned int      ip6_mtu(const struct dst_entry *dst);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void             ip6_dst_destroy(struct dst_entry *);
92 static void             ip6_dst_ifdown(struct dst_entry *,
93                                        struct net_device *dev, int how);
94 static int               ip6_dst_gc(struct dst_ops *ops);
95
96 static int              ip6_pkt_discard(struct sk_buff *skb);
97 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
98 static int              ip6_pkt_prohibit(struct sk_buff *skb);
99 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
100 static void             ip6_link_failure(struct sk_buff *skb);
101 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
102                                            struct sk_buff *skb, u32 mtu);
103 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
104                                         struct sk_buff *skb);
105 static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
106 static size_t rt6_nlmsg_size(struct fib6_info *rt);
107 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
108                          struct fib6_info *rt, struct dst_entry *dst,
109                          struct in6_addr *dest, struct in6_addr *src,
110                          int iif, int type, u32 portid, u32 seq,
111                          unsigned int flags);
112 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
113                                            struct in6_addr *daddr,
114                                            struct in6_addr *saddr);
115
116 #ifdef CONFIG_IPV6_ROUTE_INFO
117 static struct fib6_info *rt6_add_route_info(struct net *net,
118                                            const struct in6_addr *prefix, int prefixlen,
119                                            const struct in6_addr *gwaddr,
120                                            struct net_device *dev,
121                                            unsigned int pref);
122 static struct fib6_info *rt6_get_route_info(struct net *net,
123                                            const struct in6_addr *prefix, int prefixlen,
124                                            const struct in6_addr *gwaddr,
125                                            struct net_device *dev);
126 #endif
127
/* Per-CPU list of "uncached" rt6_info entries (dsts not attached to the
 * FIB tree), so they can be walked when a device goes down.
 */
struct uncached_list {
	spinlock_t		lock;	/* protects head */
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
134
135 void rt6_uncached_list_add(struct rt6_info *rt)
136 {
137         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
138
139         rt->rt6i_uncached_list = ul;
140
141         spin_lock_bh(&ul->lock);
142         list_add_tail(&rt->rt6i_uncached, &ul->head);
143         spin_unlock_bh(&ul->lock);
144 }
145
/* Unlink an rt6_info from the per-CPU uncached list it was added to,
 * and account for it in the per-netns uncache statistics.
 * Safe to call on an entry that was never added (list_empty check).
 */
void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		/* use the list recorded at add time; may differ from the
		 * current CPU's list
		 */
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}
158
/* When @dev goes away, walk every CPU's uncached list and re-parent any
 * rt6_info still referencing it onto the loopback device, moving both
 * the inet6_dev reference and the dst's device reference.
 * No-op when @dev is itself the loopback device.
 */
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			/* swap the inet6_dev reference to loopback */
			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			/* swap the dst device reference to loopback:
			 * hold the new device before putting the old one
			 */
			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
190
191 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
192                                              struct sk_buff *skb,
193                                              const void *daddr)
194 {
195         if (!ipv6_addr_any(p))
196                 return (const void *) p;
197         else if (skb)
198                 return &ipv6_hdr(skb)->daddr;
199         return daddr;
200 }
201
202 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
203                                    struct net_device *dev,
204                                    struct sk_buff *skb,
205                                    const void *daddr)
206 {
207         struct neighbour *n;
208
209         daddr = choose_neigh_daddr(gw, skb, daddr);
210         n = __ipv6_neigh_lookup(dev, daddr);
211         if (n)
212                 return n;
213
214         n = neigh_create(&nd_tbl, daddr, dev);
215         return IS_ERR(n) ? NULL : n;
216 }
217
218 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
219                                               struct sk_buff *skb,
220                                               const void *daddr)
221 {
222         const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
223
224         return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
225 }
226
227 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
228 {
229         struct net_device *dev = dst->dev;
230         struct rt6_info *rt = (struct rt6_info *)dst;
231
232         daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
233         if (!daddr)
234                 return;
235         if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
236                 return;
237         if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
238                 return;
239         __ipv6_confirm_neigh(dev, daddr);
240 }
241
/* dst_ops for regular IPv6 routes; copied per-netns into
 * net->ipv6.ip6_dst_ops.
 */
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	dst_cow_metrics_generic,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};
260
261 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
262 {
263         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
264
265         return mtu ? : dst->dev->mtu;
266 }
267
/* Intentional no-op: blackhole dsts ignore PMTU updates. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}
272
/* Intentional no-op: blackhole dsts ignore redirects. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}
277
/* dst_ops for blackhole dsts: PMTU updates and redirects are dropped
 * (see the no-op hooks above); no gc/ifdown handling is needed.
 */
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
};
289
/* Metric template for the special route entries below; all zero
 * (HOPLIMIT spelled out explicitly for documentation value).
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};
293
/* Template for the per-netns fib6_null_entry: a reject route returned
 * by lookups that match nothing.
 */
static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol  = RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,		/* worst possible metric */
	.fib6_ref	= ATOMIC_INIT(1),	/* never freed */
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};
302
/* Template for the per-netns ip6_null_entry dst: packets hitting it are
 * discarded with -ENETUNREACH.
 */
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),	/* never freed */
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
314
315 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
316
/* Template for the per-netns prohibit dst (policy routing): packets are
 * rejected with -EACCES and an ICMP admin-prohibited error.
 */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),	/* never freed */
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
328
/* Template for the per-netns blackhole dst (policy routing): packets
 * are silently discarded with -EINVAL.
 */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),	/* never freed */
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
340
341 #endif
342
/* Zero the rt6_info-specific portion of a freshly allocated entry.
 * dst_alloc() has already initialized the embedded dst_entry, so clear
 * only the bytes that follow it (dst + 1), then set up the uncached
 * list linkage.
 */
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}
350
351 /* allocate dst with ip6_dst_ops */
352 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
353                                int flags)
354 {
355         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
356                                         1, DST_OBSOLETE_FORCE_CHK, flags);
357
358         if (rt) {
359                 rt6_info_init(rt);
360                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
361         }
362
363         return rt;
364 }
365 EXPORT_SYMBOL(ip6_dst_alloc);
366
/* dst_ops->destroy hook: release everything an rt6_info holds — its
 * metrics, uncached-list membership, inet6_dev reference, and the
 * fib6_info it was created from.
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	ip_dst_metrics_put(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	/* detach from the originating fib6_info under RCU: clear the
	 * pointer first, then drop the reference we held on it
	 */
	rcu_read_lock();
	from = rcu_dereference(rt->from);
	rcu_assign_pointer(rt->from, NULL);
	fib6_info_release(from);
	rcu_read_unlock();
}
388
389 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
390                            int how)
391 {
392         struct rt6_info *rt = (struct rt6_info *)dst;
393         struct inet6_dev *idev = rt->rt6i_idev;
394         struct net_device *loopback_dev =
395                 dev_net(dev)->loopback_dev;
396
397         if (idev && idev->dev != loopback_dev) {
398                 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
399                 if (loopback_idev) {
400                         rt->rt6i_idev = loopback_idev;
401                         in6_dev_put(idev);
402                 }
403         }
404 }
405
406 static bool __rt6_check_expired(const struct rt6_info *rt)
407 {
408         if (rt->rt6i_flags & RTF_EXPIRES)
409                 return time_after(jiffies, rt->dst.expires);
410         else
411                 return false;
412 }
413
/* Full expiry check for a cached route: either its own RTF_EXPIRES
 * timer fired, or — when it is still linked to a fib6_info — the dst
 * was invalidated (obsolete changed) or the parent fib6_info expired.
 * NOTE(review): uses rcu_dereference(), so the caller is expected to
 * hold rcu_read_lock().
 */
static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}
429
/* Select one nexthop among a multipath route's siblings using the flow
 * hash: each sibling owns a slice of the hash space bounded by its
 * fib_nh_upper_bound. Falls back to @match when no usable sibling is
 * found. Called with rcu_read_lock() held (walks fib6_siblings).
 */
struct fib6_info *fib6_multipath_select(const struct net *net,
					struct fib6_info *match,
					struct flowi6 *fl6, int oif,
					const struct sk_buff *skb,
					int strict)
{
	struct fib6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	/* the hash falls within the first nexthop's slice */
	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.fib_nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&sibling->fib6_nh.fib_nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		/* candidate found; only accept it if it scores as usable */
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}
462
463 /*
464  *      Route lookup. rcu_read_lock() should be held.
465  */
466
/* Walk the leaf chain starting at @rt and pick the first non-dead entry
 * matching the output interface @oif (or, with no oif, one whose device
 * owns @saddr). Returns fib6_null_entry when a strict interface match
 * was requested but none was found, or when @rt itself is dead.
 * Caller holds rcu_read_lock() (fib6_next is rcu-dereferenced).
 */
static inline struct fib6_info *rt6_device_match(struct net *net,
						 struct fib6_info *rt,
						 const struct in6_addr *saddr,
						 int oif,
						 int flags)
{
	struct fib6_info *sprt;

	/* nothing to disambiguate: no oif, no saddr, and rt is alive */
	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
		const struct net_device *dev = sprt->fib6_nh.fib_nh_dev;

		if (sprt->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	/* strict interface match requested but not satisfied */
	if (oif && flags & RT6_LOOKUP_F_IFACE)
		return net->ipv6.fib6_null_entry;

	return rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
}
500
501 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred-work container for a router reachability probe: the target
 * router address and the (held) device to send the NS on.
 */
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;		/* router address to probe */
	struct net_device *dev;		/* reference held until the work runs */
};
507
/* Workqueue handler: send a Neighbour Solicitation to the probe target
 * via its solicited-node multicast address, then drop the device
 * reference taken in rt6_probe() and free the work item.
 */
static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}
519
/* Rate-limited Router Reachability Probing: if the route's gateway
 * neighbour is missing or not NUD_VALID and the per-device probe
 * interval has elapsed, schedule a deferred NS probe. Only applies to
 * gatewayed routes.
 */
static void rt6_probe(struct fib6_info *rt)
{
	struct __rt6_probe_work *work = NULL;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;
	struct inet6_dev *idev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !rt->fib6_nh.fib_nh_has_gw)
		return;

	nh_gw = &rt->fib6_nh.fib_nh_gw6;
	dev = rt->fib6_nh.fib_nh_dev;
	rcu_read_lock_bh();
	idev = __in6_dev_get(dev);
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		/* already reachable: nothing to probe */
		if (neigh->nud_state & NUD_VALID)
			goto out;

		/* re-check state under the neighbour lock and mark the
		 * entry as being probed so only one probe is in flight
		 */
		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else if (time_after(jiffies, rt->last_probe +
				       idev->cnf.rtr_probe_interval)) {
		/* no neighbour entry: rate-limit via rt->last_probe */
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		rt->last_probe = jiffies;
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);	/* released in rt6_probe_deferred() */
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
574 #else
/* Router Reachability Probing is only built with CONFIG_IPV6_ROUTER_PREF. */
static inline void rt6_probe(struct fib6_info *rt)
{
}
578 #endif
579
580 /*
581  * Default Router Selection (RFC 2461 6.3.6)
582  */
583 static inline int rt6_check_dev(struct fib6_info *rt, int oif)
584 {
585         const struct net_device *dev = rt->fib6_nh.fib_nh_dev;
586
587         if (!oif || dev->ifindex == oif)
588                 return 2;
589         return 0;
590 }
591
/* Neighbour component of the route score. Routes without a gateway (or
 * with RTF_NONEXTHOP) trivially succeed. Otherwise the gateway's NUD
 * state decides: VALID succeeds; with router-pref enabled anything
 * short of NUD_FAILED still succeeds and FAILED asks for a probe; a
 * missing entry triggers round-robin unless router-pref is enabled.
 */
static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	if (rt->fib6_flags & RTF_NONEXTHOP ||
	    !rt->fib6_nh.fib_nh_has_gw)
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.fib_nh_dev,
					  &rt->fib6_nh.fib_nh_gw6);
	if (neigh) {
		/* nud_state is read under the neighbour's read lock */
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
623
/* Score a route for Default Router Selection: device match contributes
 * the low bits, the decoded router preference (if enabled) the next
 * bits, and — when reachability is required — a failing neighbour check
 * short-circuits with its negative rt6_nud_state value.
 */
static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	/* router preference occupies bits above the device score */
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}
641
/* Consider @rt as a candidate against the current best @match.
 * Skips dead, link-down (when link state matters) and expired routes,
 * scores the rest, and keeps the highest score seen via *mpri.
 * A RT6_NUD_FAIL_DO_RR score requests round-robin via *do_rr.
 * Returns the (possibly updated) best match.
 */
static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
				   int *mpri, struct fib6_info *match,
				   bool *do_rr)
{
	int m;
	bool match_do_rr = false;

	if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
		goto out;

	if (ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev) &&
	    rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (fib6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}
680
/* Scan the leaf chain for the best route at @metric, starting the
 * round-robin walk at @rr_head: first rr_head..end-of-metric-run, then
 * leaf..rr_head to cover the wrap-around. If nothing at @metric
 * matched, fall through to the continuation (@cont) — the first entry
 * with a different metric — and scan from there.
 * Caller holds rcu_read_lock().
 */
static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				     struct fib6_info *leaf,
				     struct fib6_info *rr_head,
				     u32 metric, int oif, int strict,
				     bool *do_rr)
{
	struct fib6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	/* pass 1: from the round-robin head to the end of the metric run */
	for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	/* pass 2: wrap around — from the leaf up to the round-robin head */
	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	/* pass 3: nothing matched at @metric; try the remaining routes */
	for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
719
/* Pick the best route in fib6_node @fn for (@oif, @strict), applying
 * round-robin across equal-metric routes: when find_rr_leaf() asked for
 * rotation, advance fn->rr_ptr to the next same-metric route under the
 * table lock. Returns fib6_null_entry when the node has no usable
 * routes. Caller holds rcu_read_lock().
 */
static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
				   int oif, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not points to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.fib6_null_entry;
}
769
770 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
771 {
772         return (rt->fib6_flags & RTF_NONEXTHOP) || rt->fib6_nh.fib_nh_has_gw;
773 }
774
775 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Process a Route Information option received on @dev from router
 * @gwaddr (presumably an RA option — NOTE(review): field layout matches
 * RFC 4191; confirm against struct route_info). Validates the option's
 * length/prefix-length combination, then adds, refreshes, or deletes
 * the corresponding route based on the advertised lifetime and
 * preference. Returns 0 on success or -EINVAL on a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		/* prefixes longer than 64 bits need at least 2 units */
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	/* zero-length prefix means the default router entry */
	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	/* zero lifetime withdraws an existing route */
	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		/* refresh the preference on an existing route */
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
849 #endif
850
851 /*
852  *      Misc support functions
853  */
854
855 /* called with rcu_lock held */
856 static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
857 {
858         struct net_device *dev = rt->fib6_nh.fib_nh_dev;
859
860         if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
861                 /* for copies of local routes, dst->dev needs to be the
862                  * device if it is a master device, the master device if
863                  * device is enslaved, and the loopback as the default
864                  */
865                 if (netif_is_l3_slave(dev) &&
866                     !rt6_need_strict(&rt->fib6_dst.addr))
867                         dev = l3mdev_master_dev_rcu(dev);
868                 else if (!netif_is_l3_master(dev))
869                         dev = dev_net(dev)->loopback_dev;
870                 /* last case is netif_is_l3_master(dev) is true in which
871                  * case we want dev returned to be dev
872                  */
873         }
874
875         return dev;
876 }
877
/* Per-RTN_* route type property table: the error code that packets
 * hitting a route of that type are rejected with (0 means the route
 * type is not a reject route).
 */
static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};
892
/* Map a fib6 route type to the dst error it should report,
 * e.g. RTN_PROHIBIT -> -EACCES; 0 for non-reject route types.
 */
static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}
897
898 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
899 {
900         unsigned short flags = 0;
901
902         if (rt->dst_nocount)
903                 flags |= DST_NOCOUNT;
904         if (rt->dst_nopolicy)
905                 flags |= DST_NOPOLICY;
906         if (rt->dst_host)
907                 flags |= DST_HOST;
908
909         return flags;
910 }
911
912 static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
913 {
914         rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);
915
916         switch (ort->fib6_type) {
917         case RTN_BLACKHOLE:
918                 rt->dst.output = dst_discard_out;
919                 rt->dst.input = dst_discard;
920                 break;
921         case RTN_PROHIBIT:
922                 rt->dst.output = ip6_pkt_prohibit_out;
923                 rt->dst.input = ip6_pkt_prohibit;
924                 break;
925         case RTN_THROW:
926         case RTN_UNREACHABLE:
927         default:
928                 rt->dst.output = ip6_pkt_discard_out;
929                 rt->dst.input = ip6_pkt_discard;
930                 break;
931         }
932 }
933
934 static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
935 {
936         if (ort->fib6_flags & RTF_REJECT) {
937                 ip6_rt_init_dst_reject(rt, ort);
938                 return;
939         }
940
941         rt->dst.error = 0;
942         rt->dst.output = ip6_output;
943
944         if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
945                 rt->dst.input = ip6_input;
946         } else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
947                 rt->dst.input = ip6_mc_input;
948         } else {
949                 rt->dst.input = ip6_forward;
950         }
951
952         if (ort->fib6_nh.fib_nh_lws) {
953                 rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.fib_nh_lws);
954                 lwtunnel_set_redirect(&rt->dst);
955         }
956
957         rt->dst.lastuse = jiffies;
958 }
959
/* Caller must already hold reference to @from */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	/* the caller's reference on @from is handed over to rt->from */
	rcu_assign_pointer(rt->from, from);
	/* initialize dst metrics from the originating fib entry */
	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
}
967
/* Caller must already hold reference to @ort */
static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
{
	struct net_device *dev = fib6_info_nh_dev(ort);

	/* set dst error/input/output according to ort's type */
	ip6_rt_init_dst(rt, ort);

	rt->rt6i_dst = ort->fib6_dst;
	/* dev may be NULL when the nexthop has no device */
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_flags = ort->fib6_flags;
	if (ort->fib6_nh.fib_nh_has_gw) {
		rt->rt6i_gateway = ort->fib6_nh.fib_nh_gw6;
		rt->rt6i_flags |= RTF_GATEWAY;
	}
	/* takes over the reference on @ort held by the caller */
	rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->fib6_src;
#endif
}
987
/* Walk back up the fib trie after a failed lookup at @fn: re-enter
 * the parent's source-address subtree when we fell out of one,
 * otherwise step to the parent itself. Returns the next node that
 * carries route info, or NULL once the tree root is reached.
 * Runs under RCU (uses rcu_dereference).
 */
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			/* we came out of the parent's subtree: redo the
			 * source lookup inside that subtree
			 */
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}
1005
1006 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
1007 {
1008         struct rt6_info *rt = *prt;
1009
1010         if (dst_hold_safe(&rt->dst))
1011                 return true;
1012         if (net) {
1013                 rt = net->ipv6.ip6_null_entry;
1014                 dst_hold(&rt->dst);
1015         } else {
1016                 rt = NULL;
1017         }
1018         *prt = rt;
1019         return false;
1020 }
1021
/* called with rcu_lock held */
static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev = rt->fib6_nh.fib_nh_dev;
	struct rt6_info *nrt;

	/* only take a reference if one can still safely be taken */
	if (!fib6_info_hold_safe(rt))
		goto fallback;

	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (!nrt) {
		fib6_info_release(rt);
		goto fallback;
	}

	/* the reference taken above is handed over to nrt->from */
	ip6_rt_copy_init(nrt, rt);
	return nrt;

fallback:
	/* on any failure return a held null route instead of NULL */
	nrt = dev_net(dev)->ipv6.ip6_null_entry;
	dst_hold(&nrt->dst);
	return nrt;
}
1046
/* Look up a route for @fl6 in @table without creating a pcpu copy:
 * walk the fib trie (backtracking towards less specific nodes on a
 * miss), optionally select among multipath siblings, and prefer a
 * matching entry from the exception (RTF_CACHE) table.
 * Returns a dst the caller must release; ip6_null_entry when nothing
 * matches.
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_info *f6i;
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	f6i = rcu_dereference(fn->leaf);
	if (!f6i) {
		f6i = net->ipv6.fib6_null_entry;
	} else {
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
				      fl6->flowi6_oif, flags);
		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
			f6i = fib6_multipath_select(net, f6i, fl6,
						    fl6->flowi6_oif, skb,
						    flags);
	}
	if (f6i == net->ipv6.fib6_null_entry) {
		/* no usable route at this node: back up the trie */
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);
	} else if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		/* build a dst for this fib entry (falls back to a held
		 * null route internally on failure)
		 */
		rt = ip6_create_rt_rcu(f6i);
	}

	rcu_read_unlock();

	return rt;
}
1098
/* Public lookup entry point: resolve @fl6 through the policy routing
 * rules via fib6_rule_lookup() using ip6_pol_route_lookup().
 */
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
1105
1106 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1107                             const struct in6_addr *saddr, int oif,
1108                             const struct sk_buff *skb, int strict)
1109 {
1110         struct flowi6 fl6 = {
1111                 .flowi6_oif = oif,
1112                 .daddr = *daddr,
1113         };
1114         struct dst_entry *dst;
1115         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1116
1117         if (saddr) {
1118                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1119                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1120         }
1121
1122         dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1123         if (dst->error == 0)
1124                 return (struct rt6_info *) dst;
1125
1126         dst_release(dst);
1127
1128         return NULL;
1129 }
1130 EXPORT_SYMBOL(rt6_lookup);
1131
/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason,
 * the route is released.
 * Caller must hold dst before calling it.
 */
1137
/* Insert @rt into its fib6 table under tb6_lock.
 * Returns the fib6_add() result: 0 on success, negative errno
 * otherwise.
 */
static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}
1151
/* Wrapper around __ip6_ins_rt() with a netlink info that only
 * carries the target netns (no notification attributes, no extack).
 */
int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}
1158
/* Allocate an RTF_CACHE clone of fib entry @ort keyed by host route
 * @daddr (and, with subtrees, @saddr). Returns NULL on allocation
 * failure or when @ort is already going away.
 */
static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (!fib6_info_hold_safe(ort))
		return NULL;

	dev = ip6_rt_get_dev_rcu(ort);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt) {
		fib6_info_release(ort);
		return NULL;
	}

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	/* cache entries are host routes (/128) for @daddr */
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
1200
/* Allocate a per-cpu private copy (RTF_PCPU) of fib entry @rt.
 * Returns NULL on allocation failure or when @rt is already going
 * away.
 */
static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	if (!fib6_info_hold_safe(rt))
		return NULL;

	/* device resolution must happen inside an RCU section */
	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt) {
		fib6_info_release(rt);
		return NULL;
	}
	/* the reference taken above is handed over to pcpu_rt->from */
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}
1222
/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	/* take a reference; ip6_hold_safe(NULL, ...) replaces the
	 * pointer with NULL if the entry is already being destroyed
	 */
	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt);

	return pcpu_rt;
}
1236
/* Create the per-cpu cached copy of @rt for the current CPU and
 * install it in rt6i_pcpu. Returns the new entry with an extra
 * reference held for the caller, or a held ip6_null_entry on
 * allocation failure.
 */
static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	/* NOTE(review): BUG_ON assumes no concurrent installer can
	 * fill this CPU's slot first — confirm caller serialization
	 */
	BUG_ON(prev);

	return pcpu_rt;
}
1255
/* exception hash table implementation
 */

/* Serializes all modifications of the per-route exception buckets
 * (insert, remove, flush); lookups on the read side use RCU instead.
 */
static DEFINE_SPINLOCK(rt6_exception_lock);
1259
/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct fib6_info *from;
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	net->ipv6.rt6_stats->fib_rt_cache--;

	/* purge completely the exception to allow releasing the held resources:
	 * some [sk] cache may keep the dst around for unlimited time
	 */
	from = rcu_dereference_protected(rt6_ex->rt6i->from,
					 lockdep_is_held(&rt6_exception_lock));
	rcu_assign_pointer(rt6_ex->rt6i->from, NULL);
	fib6_info_release(from);
	dst_dev_put(&rt6_ex->rt6i->dst);

	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	/* rt6_ex may still be visible to RCU readers: defer the free */
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
}
1290
1291 /* Remove oldest rt6_ex in bucket and free the memory
1292  * Caller must hold rt6_exception_lock
1293  */
1294 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1295 {
1296         struct rt6_exception *rt6_ex, *oldest = NULL;
1297
1298         if (!bucket)
1299                 return;
1300
1301         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1302                 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1303                         oldest = rt6_ex;
1304         }
1305         rt6_remove_exception(bucket, oldest);
1306 }
1307
/* Hash (dst[, src]) into one of the FIB6_EXCEPTION_BUCKET_SIZE
 * exception buckets. The jhash seed is generated lazily, once, on
 * first use.
 */
static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	/* with subtrees, fold the source address into the hash too */
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
1323
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	/* side effect: advance the caller's pointer to the chain */
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1356
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	/* side effect: advance the caller's pointer to the chain */
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1391
/* Effective MTU of fib entry @rt: the route's own PMTU if set,
 * otherwise the nexthop device's IPv6 MTU; capped at IP6_MAX_MTU and
 * reduced by any lightweight-tunnel encapsulation headroom.
 */
static unsigned int fib6_mtu(const struct fib6_info *rt)
{
	unsigned int mtu;

	if (rt->fib6_pmtu) {
		mtu = rt->fib6_pmtu;
	} else {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		/* NOTE(review): idev is dereferenced without a NULL
		 * check — assumes the nexthop device still has IPv6
		 * enabled; confirm against callers
		 */
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(rt->fib6_nh.fib_nh_lws, mtu);
}
1412
/* Insert cached route @nrt as an exception hanging off fib entry
 * @ort. Allocates the bucket array on first use, replaces any
 * existing exception with the same (daddr[, saddr]) key and evicts
 * the oldest entry once a bucket exceeds FIB6_MAX_DEPTH.
 * Returns 0 on success or a negative errno.
 */
static int rt6_insert_exception(struct rt6_info *nrt,
				struct fib6_info *ort)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	/* ort was flushed: do not re-create exception state for it */
	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
		err = -EINVAL;
		goto out;
	}

	/* replace a pre-existing exception for the same key */
	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->fib6_table->tb6_lock);
		fib6_update_sernum(net, ort);
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}
1492
/* Remove every exception route hanging off @rt and mark the entry so
 * rt6_insert_exception() cannot re-create the bucket list afterwards.
 * Note: the bucket array itself is not freed here — presumably done
 * when @rt is destroyed; confirm.
 */
void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() to recreate the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}
1519
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 * Returns the cached clone (no reference taken) or NULL when there is
 * none or it has expired.
 */
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->fib6_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}
1551
/* Remove the passed in cached rt from the hash table that contains it
 * Returns 0 on success, -EINVAL when @rt is not a valid cached route,
 * -ENOENT when it is not present in the table.
 */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	/* NOTE(review): rt->from is rcu_dereference()d without a
	 * visible rcu_read_lock() here — presumably callers hold it;
	 * confirm
	 */
	from = rcu_dereference(rt->from);
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}
1595
/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	/* nothing to refresh when rt is not (or no longer) a cached route */
	if (!from || !(rt->rt6i_flags & RTF_CACHE))
		goto unlock;

	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	/* mark the exception as recently used for GC aging */
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

unlock:
	rcu_read_unlock();
}
1632
1633 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1634                                          struct rt6_info *rt, int mtu)
1635 {
1636         /* If the new MTU is lower than the route PMTU, this new MTU will be the
1637          * lowest MTU in the path: always allow updating the route PMTU to
1638          * reflect PMTU decreases.
1639          *
1640          * If the new MTU is higher, and the route PMTU is equal to the local
1641          * MTU, this means the old MTU is the lowest in the path, so allow
1642          * updating it: if other nodes now have lower MTUs, PMTU discovery will
1643          * handle this.
1644          */
1645
1646         if (dst_mtu(&rt->dst) >= mtu)
1647                 return true;
1648
1649         if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1650                 return true;
1651
1652         return false;
1653 }
1654
/* Propagate an MTU change to every cached exception route of @rt that
 * is allowed to take it (see rt6_mtu_change_route_allowed()).
 * Caller must hold rt6_exception_lock (see rcu_dereference_protected
 * below).
 */
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}
1683
/* an entry is eligible only when it is both a gateway route and part
 * of the exception table (RTF_CACHE)
 */
#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

/* Drop every cached exception route of @rt whose gateway equals
 * @gateway.
 */
static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}
1720
/* GC helper: decide whether one exception entry should be removed
 * now. Removes aged-out non-gateway clones, expired (RTF_EXPIRES)
 * entries, and gateway routes whose neighbour lost the NTF_ROUTER
 * flag; otherwise bumps gc_args->more so GC runs again.
 */
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others have still references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		/* gateway is no longer advertising itself as a router */
		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}
1764
/* Walk every exception bucket of @rt and age out stale cached routes.
 *
 * Takes rcu_read_lock_bh() so the noref neighbour lookup in
 * rt6_age_examine_exception() is safe, and rt6_exception_lock to protect
 * the bucket chains against concurrent insert/remove.  @gc_args carries
 * the aging timeout in and the count of surviving entries out.
 */
void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* cheap lockless check before taking the spinlock */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			/* _safe variant: examine may unlink the entry */
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}
1795
/* Look up the fib entry matching @fl6 in @table.  Must be called with
 * rcu lock held.
 *
 * On a miss the search backtracks towards the tree root; if the
 * RT6_LOOKUP_F_REACHABLE restriction produced nothing, the whole search
 * is redone from the original node with that bit cleared.  Never returns
 * NULL - a total miss yields net->ipv6.fib6_null_entry.
 */
struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
				    int oif, struct flowi6 *fl6, int strict)
{
	struct fib6_node *fn, *saved_fn;
	struct fib6_info *f6i;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	/* caller asked to ignore the nexthop device when selecting a route */
	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	f6i = rt6_select(net, fn, oif, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	return f6i;
}
1827
/* Policy-routing core: resolve @fl6 in @table to a dst-holding rt6_info.
 *
 * Returns, in order of preference:
 *  - a cached exception route (RTF_CACHE) found for daddr/saddr,
 *  - for FLOWI_FLAG_KNOWN_NH without a gateway, a fresh uncached
 *    RTF_CACHE clone not owned by the fib6 tree,
 *  - otherwise a per-cpu copy of the fib entry.
 * A lookup miss returns net->ipv6.ip6_null_entry.  Every path returns
 * with a reference held on the result.
 */
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_info *f6i;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	/* host (not forwarding): prefer routes with a reachable nexthop */
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	f6i = fib6_table_lookup(net, table, oif, fl6, strict);
	if (f6i->fib6_nsiblings)
		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);

	if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		return rt;
	}

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		/* take a noref usage stamp only if we got the reference */
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !f6i->fib6_nh.fib_nh_has_gw)) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		return uncached_rt;
	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		/* bh disabled around the percpu access - NOTE(review):
		 * presumably because the pcpu route is also touched from
		 * softirq context; confirm before relying on this.
		 */
		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(f6i);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, f6i);

		local_bh_enable();
		rcu_read_unlock();

		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
1905
/* fib6_rule_lookup() callback for the input path: the incoming interface
 * from the flow is used as the lookup oif.
 */
static struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}
1914
/* Look up a dst for a received packet described by @fl6.
 *
 * Link-local/multicast destinations force a strict interface match,
 * except on PIM register devices which have no meaningful link scope.
 */
struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6,
					 const struct sk_buff *skb,
					 int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1927
/* Fill @keys with the L3 members used for multipath hashing of @skb.
 *
 * For ICMPv6 error messages the addresses/flowlabel of the embedded
 * (inner) offending packet are used instead of the outer header, so the
 * error is hashed onto the same path as the flow that triggered it; the
 * pre-dissected @flkeys describe the outer packet and are ignored in
 * that case.
 */
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	/* only ICMPv6 errors carry an embedded offending packet */
	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	/* hash on the inner header; outer dissection no longer applies */
	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowlabel(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}
1975
/* Compute the multipath hash for a flow.  If @skb is set it is used as
 * the key source and @fl6 can be NULL; otherwise @fl6 must be valid.
 *
 * Policy 0 hashes on L3 only (addresses + flow label + protocol);
 * policy 1 hashes on the L3/L4 5-tuple.  The result is shifted right by
 * one so the topmost bit is never set (reserved by the caller).
 */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		/* L3 hash */
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		/* L4 (5-tuple) hash */
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			/* dissect now if the caller did not do it for us */
			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}
2032
/* Route a received packet: build a flow from the IPv6 header and attach
 * the looked-up dst to @skb, replacing any dst already set.
 */
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	/* packets arriving over a collect-md tunnel match on the tunnel id */
	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	/* reuse the early dissection done for fib rules, if any */
	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	/* precompute the multipath hash for ICMPv6 so errors follow the
	 * same path as the flow that triggered them
	 */
	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}
2062
/* fib6_rule_lookup() callback for the output path: the outgoing
 * interface from the flow is used as the lookup oif.
 */
static struct rt6_info *ip6_pol_route_output(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
}
2071
/* Look up an output dst for locally generated traffic described by @fl6.
 *
 * Link-local/multicast destinations are first offered to an L3 master
 * device (VRF) lookup.  The flow's iif is forced to loopback since the
 * packet originates here, and RT6_LOOKUP_F_IFACE is set whenever the
 * choice of interface is constrained (bound socket, strict-scope
 * destination, or oif given without a source address).
 */
struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	if (ipv6_addr_type(&fl6->daddr) &
	    (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		/* let the socket's address-selection preferences guide
		 * source selection
		 */
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2101
/* Replace @dst_orig with a blackhole dst that silently discards all
 * traffic (input and output) while preserving the original's metrics,
 * gateway and keys.  Consumes the reference on @dst_orig.  Returns the
 * new dst or ERR_PTR(-ENOMEM) if allocation failed.
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new = &rt->dst;
		new->__use = 1;
		/* both directions drop on the floor */
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		/* not a per-cpu copy of anything */
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}
2134
2135 /*
2136  *      Destination cache support functions
2137  */
2138
2139 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2140 {
2141         u32 rt_cookie = 0;
2142
2143         if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2144                 return false;
2145
2146         if (fib6_check_expired(f6i))
2147                 return false;
2148
2149         return true;
2150 }
2151
2152 static struct dst_entry *rt6_check(struct rt6_info *rt,
2153                                    struct fib6_info *from,
2154                                    u32 cookie)
2155 {
2156         u32 rt_cookie = 0;
2157
2158         if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2159             rt_cookie != cookie)
2160                 return NULL;
2161
2162         if (rt6_check_expired(rt))
2163                 return NULL;
2164
2165         return &rt->dst;
2166 }
2167
2168 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2169                                             struct fib6_info *from,
2170                                             u32 cookie)
2171 {
2172         if (!__rt6_check_expired(rt) &&
2173             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2174             fib6_check(from, cookie))
2175                 return &rt->dst;
2176         else
2177                 return NULL;
2178 }
2179
/* dst_ops->check() for IPv6: return @dst if it is still valid for the
 * caller's @cookie, NULL if the caller must re-look-up the route.
 */
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct dst_entry *dst_ret;
	struct fib6_info *from;
	struct rt6_info *rt;

	rt = container_of(dst, struct rt6_info, dst);

	rcu_read_lock();

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	from = rcu_dereference(rt->from);

	/* per-cpu copies and uncached clones validate through their fib
	 * parent; plain cached routes validate directly
	 */
	if (from && (rt->rt6i_flags & RTF_PCPU ||
	    unlikely(!list_empty(&rt->rt6i_uncached))))
		dst_ret = rt6_dst_from_check(rt, from, cookie);
	else
		dst_ret = rt6_check(rt, from, cookie);

	rcu_read_unlock();

	return dst_ret;
}
2207
/* dst_ops->negative_advice(): the caller suspects @dst is bad.
 *
 * Cached exception routes are only dropped from the exception table when
 * actually expired (the caller keeps its reference either way); any
 * other route is released and NULL is returned so the caller re-routes.
 */
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			rcu_read_lock();
			if (rt6_check_expired(rt)) {
				rt6_remove_exception_rt(rt);
				dst = NULL;
			}
			rcu_read_unlock();
		} else {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}
2227
/* dst_ops->link_failure(): delivery via skb's route failed.
 *
 * Sends an address-unreachable ICMPv6 error back, then either drops the
 * cached exception route or, for a default route, invalidates the fib
 * node's serial number so subsequent lookups bypass stale state.
 */
static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		rcu_read_lock();
		if (rt->rt6i_flags & RTF_CACHE) {
			rt6_remove_exception_rt(rt);
		} else {
			struct fib6_info *from;
			struct fib6_node *fn;

			from = rcu_dereference(rt->from);
			if (from) {
				fn = rcu_dereference(from->fib6_node);
				/* force revalidation of cached dsts hanging
				 * off this node
				 */
				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
					fn->fn_sernum = -1;
			}
		}
		rcu_read_unlock();
	}
}
2253
/* Arm (or re-arm) the expiry timer of @rt0 to @timeout from now and mark
 * it RTF_EXPIRES.  If the route was not expiring yet, first inherit the
 * current expiry from its fib parent so dst_set_expires() - which only
 * ever moves the deadline earlier - starts from the right baseline.
 */
static void rt6_update_expires(struct rt6_info *rt0, int timeout)
{
	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
		struct fib6_info *from;

		rcu_read_lock();
		from = rcu_dereference(rt0->from);
		if (from)
			rt0->dst.expires = from->expires;
		rcu_read_unlock();
	}

	dst_set_expires(&rt0->dst, timeout);
	rt0->rt6i_flags |= RTF_EXPIRES;
}
2269
/* Record a learned path MTU on @rt and give it a limited lifetime
 * (ip6_rt_mtu_expires), marking it as modified by PMTU discovery.
 */
static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
	struct net *net = dev_net(rt->dst.dev);

	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
	rt->rt6i_flags |= RTF_MODIFIED;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}
2278
2279 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2280 {
2281         return !(rt->rt6i_flags & RTF_CACHE) &&
2282                 (rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from));
2283 }
2284
2285 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2286                                  const struct ipv6hdr *iph, u32 mtu)
2287 {
2288         const struct in6_addr *daddr, *saddr;
2289         struct rt6_info *rt6 = (struct rt6_info *)dst;
2290
2291         if (dst_metric_locked(dst, RTAX_MTU))
2292                 return;
2293
2294         if (iph) {
2295                 daddr = &iph->daddr;
2296                 saddr = &iph->saddr;
2297         } else if (sk) {
2298                 daddr = &sk->sk_v6_daddr;
2299                 saddr = &inet6_sk(sk)->saddr;
2300         } else {
2301                 daddr = NULL;
2302                 saddr = NULL;
2303         }
2304         dst_confirm_neigh(dst, daddr);
2305         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2306         if (mtu >= dst_mtu(dst))
2307                 return;
2308
2309         if (!rt6_cache_allowed_for_pmtu(rt6)) {
2310                 rt6_do_update_pmtu(rt6, mtu);
2311                 /* update rt6_ex->stamp for cache */
2312                 if (rt6->rt6i_flags & RTF_CACHE)
2313                         rt6_update_exception_stamp_rt(rt6);
2314         } else if (daddr) {
2315                 struct fib6_info *from;
2316                 struct rt6_info *nrt6;
2317
2318                 rcu_read_lock();
2319                 from = rcu_dereference(rt6->from);
2320                 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
2321                 if (nrt6) {
2322                         rt6_do_update_pmtu(nrt6, mtu);
2323                         if (rt6_insert_exception(nrt6, from))
2324                                 dst_release_immediate(&nrt6->dst);
2325                 }
2326                 rcu_read_unlock();
2327         }
2328 }
2329
/* dst_ops->update_pmtu(): forward to __ip6_rt_update_pmtu(), taking the
 * addresses from the skb's IPv6 header when one is supplied.
 */
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu)
{
	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
}
2335
/* Update the path MTU for the flow described by the IPv6 header at the
 * start of @skb's data (e.g. from a received Packet Too Big).
 * @mtu arrives in network byte order and is converted before use.
 */
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
		     int oif, u32 mark, kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_uid = uid,
	};

	dst = ip6_route_output(net, NULL, &fl6);
	if (!dst->error)
		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2356
/* Socket-scoped PMTU update: apply the new MTU using the socket's bound
 * device (or its L3 master), then refresh the socket's cached dst if the
 * update invalidated it.
 */
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	int oif = sk->sk_bound_dev_if;
	struct dst_entry *dst;

	if (!oif && skb->dev)
		oif = l3mdev_master_ifindex(skb->dev);

	ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);

	/* nothing more to do if the socket's dst is still valid */
	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2378
/* Store @dst on @sk, recording the destination (and, with subtrees, the
 * source) only when the flow's addresses match the socket's, so that a
 * later lookup with different addresses will not reuse this dst.
 */
void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
			   const struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_SUBTREES
	struct ipv6_pinfo *np = inet6_sk(sk);
#endif

	ip6_dst_store(sk, dst,
		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
		      &sk->sk_v6_daddr : NULL,
#ifdef CONFIG_IPV6_SUBTREES
		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
		      &np->saddr :
#endif
		      NULL);
}
2395
/* Handle redirects: a flowi6 extended with the redirecting router's
 * address, smuggled through fib6_rule_lookup() so __ip6_route_redirect()
 * can verify the redirect came from the current nexthop (fl6 must stay
 * the first member so the pointer cast back works).
 */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;
};
2401
/* fib6_rule_lookup() callback used when processing an ICMPv6 redirect:
 * find the route currently used for the destination and accept the
 * redirect only if it was sent by that route's nexthop.
 */
static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *ret = NULL, *rt_cache;
	struct fib6_info *rt;
	struct fib6_node *fn;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for_each_fib6_node_rt_rcu(fn) {
		/* skip dead, expired and gateway-less nexthops; a
		 * redirect can only come via a gateway route on the
		 * interface the redirect arrived on
		 */
		if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
			continue;
		if (fib6_check_expired(rt))
			continue;
		if (rt->fib6_flags & RTF_REJECT)
			break;
		if (!rt->fib6_nh.fib_nh_has_gw)
			continue;
		if (fl6->flowi6_oif != rt->fib6_nh.fib_nh_dev->ifindex)
			continue;
		/* rt_cache's gateway might be different from its 'parent'
		 * in the case of an ip redirect.
		 * So we keep searching in the exception table if the gateway
		 * is different.
		 */
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.fib_nh_gw6)) {
			rt_cache = rt6_find_cached_rt(rt,
						      &fl6->daddr,
						      &fl6->saddr);
			if (rt_cache &&
			    ipv6_addr_equal(&rdfl->gateway,
					    &rt_cache->rt6i_gateway)) {
				ret = rt_cache;
				break;
			}
			continue;
		}
		break;
	}

	if (!rt)
		rt = net->ipv6.fib6_null_entry;
	else if (rt->fib6_flags & RTF_REJECT) {
		ret = net->ipv6.ip6_null_entry;
		goto out;
	}

	/* nothing matched at this node: backtrack towards the root */
	if (rt == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

out:
	if (ret)
		ip6_hold_safe(net, &ret);
	else
		ret = ip6_create_rt_rcu(rt);

	rcu_read_unlock();

	trace_fib6_table_lookup(net, rt, table, fl6);
	return ret;
};
2481
2482 static struct dst_entry *ip6_route_redirect(struct net *net,
2483                                             const struct flowi6 *fl6,
2484                                             const struct sk_buff *skb,
2485                                             const struct in6_addr *gateway)
2486 {
2487         int flags = RT6_LOOKUP_F_HAS_SADDR;
2488         struct ip6rd_flowi rdfl;
2489
2490         rdfl.fl6 = *fl6;
2491         rdfl.gateway = *gateway;
2492
2493         return fib6_rule_lookup(net, &rdfl.fl6, skb,
2494                                 flags, __ip6_route_redirect);
2495 }
2496
/* Process an ICMPv6 redirect whose embedded offending packet starts at
 * skb->data: look up the affected route and apply the new nexthop (the
 * redirecting router is the outer header's source address).
 */
void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
		  kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_iif = LOOPBACK_IFINDEX,
		.flowi6_oif = oif,
		.flowi6_mark = mark,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_uid = uid,
	};

	dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_redirect);
2517
/* Process an ICMPv6 redirect using only the redirect message itself
 * (no embedded packet header available): the target flow is built from
 * the rd_msg destination.
 * NOTE(review): .saddr is set to the outer header's *destination*
 * (i.e. our own address as the redirected sender) - looks intentional
 * for this no-header case, but confirm against the redirect handling in
 * ndisc before changing.
 */
void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_iif = LOOPBACK_IFINDEX,
		.flowi6_oif = oif,
		.daddr = msg->dest,
		.saddr = iph->daddr,
		.flowi6_uid = sock_net_uid(net, NULL),
	};

	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
2535
2536 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2537 {
2538         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2539                      sk->sk_uid);
2540 }
2541 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2542
2543 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2544 {
2545         struct net_device *dev = dst->dev;
2546         unsigned int mtu = dst_mtu(dst);
2547         struct net *net = dev_net(dev);
2548
2549         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2550
2551         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2552                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2553
2554         /*
2555          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2556          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2557          * IPV6_MAXPLEN is also valid and means: "any MSS,
2558          * rely only on pmtu discovery"
2559          */
2560         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2561                 mtu = IPV6_MAXPLEN;
2562         return mtu;
2563 }
2564
2565 static unsigned int ip6_mtu(const struct dst_entry *dst)
2566 {
2567         struct inet6_dev *idev;
2568         unsigned int mtu;
2569
2570         mtu = dst_metric_raw(dst, RTAX_MTU);
2571         if (mtu)
2572                 goto out;
2573
2574         mtu = IPV6_MIN_MTU;
2575
2576         rcu_read_lock();
2577         idev = __in6_dev_get(dst->dev);
2578         if (idev)
2579                 mtu = idev->cnf.mtu6;
2580         rcu_read_unlock();
2581
2582 out:
2583         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2584
2585         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2586 }
2587
2588 /* MTU selection:
2589  * 1. mtu on route is locked - use it
2590  * 2. mtu from nexthop exception
2591  * 3. mtu from egress device
2592  *
2593  * based on ip6_dst_mtu_forward and exception logic of
2594  * rt6_find_cached_rt; called with rcu_read_lock
2595  */
u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
		      struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct in6_addr *src_key;
	struct inet6_dev *idev;
	u32 mtu = 0;

	/* Rule 1 (see comment above): a locked MTU metric overrides the
	 * exception and device lookups.
	 */
	if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
		mtu = f6i->fib6_pmtu;
		if (mtu)
			goto out;
	}

	src_key = NULL;
#ifdef CONFIG_IPV6_SUBTREES
	/* source-specific routes key their exceptions by (daddr, saddr) */
	if (f6i->fib6_src.plen)
		src_key = saddr;
#endif

	/* Rule 2: a non-expired cached exception may carry a learned
	 * PMTU; caller holds rcu_read_lock per the contract above.
	 */
	bucket = rcu_dereference(f6i->rt6i_exception_bucket);
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);

	/* Rule 3: fall back to the egress device MTU, never below the
	 * IPv6 minimum.
	 */
	if (likely(!mtu)) {
		struct net_device *dev = fib6_info_nh_dev(f6i);

		mtu = IPV6_MIN_MTU;
		idev = __in6_dev_get(dev);
		if (idev && idev->cnf.mtu6 > mtu)
			mtu = idev->cnf.mtu6;
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
out:
	/* reserve headroom for any lightweight-tunnel encapsulation */
	return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
}
2635
/* Allocate a standalone (uncached) dst for sending an ICMPv6 packet
 * out @dev toward fl6->daddr. Returns an ERR_PTR on allocation
 * failure, otherwise the result of xfrm_lookup() on the new dst.
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		in6_dev_put(idev);
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	/* host route to the flow's destination; the idev reference taken
	 * above is stored in rt6i_idev (presumably dropped when the dst
	 * is destroyed — see ip6_dst_destroy, not shown here)
	 */
	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output  = ip6_output;
	rt->rt6i_gateway  = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev     = idev;
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Add this dst into uncached_list so that rt6_disable_ip() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);
	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}
2674
2675 static int ip6_dst_gc(struct dst_ops *ops)
2676 {
2677         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2678         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2679         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2680         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2681         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2682         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2683         int entries;
2684
2685         entries = dst_entries_get_fast(ops);
2686         if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2687             entries <= rt_max_size)
2688                 goto out;
2689
2690         net->ipv6.ip6_rt_gc_expire++;
2691         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2692         entries = dst_entries_get_slow(ops);
2693         if (entries < ops->gc_thresh)
2694                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2695 out:
2696         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2697         return entries > rt_max_size;
2698 }
2699
2700 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2701                                             struct fib6_config *cfg,
2702                                             const struct in6_addr *gw_addr,
2703                                             u32 tbid, int flags)
2704 {
2705         struct flowi6 fl6 = {
2706                 .flowi6_oif = cfg->fc_ifindex,
2707                 .daddr = *gw_addr,
2708                 .saddr = cfg->fc_prefsrc,
2709         };
2710         struct fib6_table *table;
2711         struct rt6_info *rt;
2712
2713         table = fib6_get_table(net, tbid);
2714         if (!table)
2715                 return NULL;
2716
2717         if (!ipv6_addr_any(&cfg->fc_prefsrc))
2718                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2719
2720         flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2721         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2722
2723         /* if table lookup failed, fall back to full lookup */
2724         if (rt == net->ipv6.ip6_null_entry) {
2725                 ip6_rt_put(rt);
2726                 rt = NULL;
2727         }
2728
2729         return rt;
2730 }
2731
/* Validate the gateway of an RTNH_F_ONLINK route: the gateway must not
 * already resolve (in the device's table) through a local/anycast/
 * reject route or via a different device. The default route is ignored
 * as a match. Returns 0 if acceptable, -EINVAL with extack set if not.
 */
static int ip6_route_check_nh_onlink(struct net *net,
				     struct fib6_config *cfg,
				     const struct net_device *dev,
				     struct netlink_ext_ack *extack)
{
	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
	struct fib6_info *from;
	struct rt6_info *grt;
	int err;

	err = 0;
	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
	if (grt) {
		/* grt->from is RCU-protected; hold the read lock across
		 * the dereference and the checks on it
		 */
		rcu_read_lock();
		from = rcu_dereference(grt->from);
		if (!grt->dst.error &&
		    /* ignore match if it is the default route */
		    from && !ipv6_addr_any(&from->fib6_dst.addr) &&
		    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop has invalid gateway or device mismatch");
			err = -EINVAL;
		}
		rcu_read_unlock();

		ip6_rt_put(grt);
	}

	return err;
}
2764
/* Check that the gateway in @cfg is reachable without a further
 * gateway hop. When *_dev is not yet set, fill *_dev and *idev from
 * the route that reaches the gateway, taking references the caller
 * must release. Returns 0 on success, -EHOSTUNREACH otherwise.
 */
static int ip6_route_check_nh(struct net *net,
			      struct fib6_config *cfg,
			      struct net_device **_dev,
			      struct inet6_dev **idev)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	struct net_device *dev = _dev ? *_dev : NULL;
	struct rt6_info *grt = NULL;
	int err = -EHOSTUNREACH;

	/* try the user-specified table first; discard a match that goes
	 * through another gateway or out a different device
	 */
	if (cfg->fc_table) {
		int flags = RT6_LOOKUP_F_IFACE;

		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
					  cfg->fc_table, flags);
		if (grt) {
			if (grt->rt6i_flags & RTF_GATEWAY ||
			    (dev && dev != grt->dst.dev)) {
				ip6_rt_put(grt);
				grt = NULL;
			}
		}
	}

	/* fall back to a full lookup across tables */
	if (!grt)
		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);

	if (!grt)
		goto out;

	if (dev) {
		if (dev != grt->dst.dev) {
			ip6_rt_put(grt);
			goto out;
		}
	} else {
		/* hand device and idev to the caller with references held */
		*_dev = dev = grt->dst.dev;
		*idev = grt->rt6i_idev;
		dev_hold(dev);
		in6_dev_hold(grt->rt6i_idev);
	}

	if (!(grt->rt6i_flags & RTF_GATEWAY))
		err = 0;

	ip6_rt_put(grt);

out:
	return err;
}
2815
/* Validate cfg->fc_gateway and resolve the egress device/idev when not
 * already known (*_dev/*idev may be updated via ip6_route_check_nh).
 * Rejects gateways that are local addresses, that are neither unicast
 * nor IPv4-mapped, or whose egress device is loopback. Returns 0 on
 * success or a negative errno with extack set.
 */
static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
			   struct net_device **_dev, struct inet6_dev **idev,
			   struct netlink_ext_ack *extack)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	int gwa_type = ipv6_addr_type(gw_addr);
	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
	const struct net_device *dev = *_dev;
	bool need_addr_check = !dev;
	int err = -EINVAL;

	/* if gw_addr is local we will fail to detect this in case
	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
	 * will return already-added prefix route via interface that
	 * prefix route was assigned to, which might be non-loopback.
	 */
	if (dev &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
		/* IPv6 strictly inhibits using not link-local
		 * addresses as nexthop address.
		 * Otherwise, router will not able to send redirects.
		 * It is very good, but in some (rare!) circumstances
		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
		 * some exceptions. --ANK
		 * We allow IPv4-mapped nexthops to support RFC4798-type
		 * addressing
		 */
		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
			NL_SET_ERR_MSG(extack, "Invalid gateway address");
			goto out;
		}

		if (cfg->fc_flags & RTNH_F_ONLINK)
			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
		else
			err = ip6_route_check_nh(net, cfg, _dev, idev);

		if (err)
			goto out;
	}

	/* reload in case device was changed */
	dev = *_dev;

	err = -EINVAL;
	if (!dev) {
		NL_SET_ERR_MSG(extack, "Egress device not specified");
		goto out;
	} else if (dev->flags & IFF_LOOPBACK) {
		NL_SET_ERR_MSG(extack,
			       "Egress device can not be loopback device for this route");
		goto out;
	}

	/* if we did not check gw_addr above, do so now that the
	 * egress device has been resolved.
	 */
	if (need_addr_check &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	err = 0;
out:
	return err;
}
2888
2889 static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type)
2890 {
2891         if ((flags & RTF_REJECT) ||
2892             (dev && (dev->flags & IFF_LOOPBACK) &&
2893              !(addr_type & IPV6_ADDR_LOOPBACK) &&
2894              !(flags & RTF_LOCAL)))
2895                 return true;
2896
2897         return false;
2898 }
2899
/* Initialize @fib6_nh from the route config @cfg: resolve the egress
 * device, validate any gateway, build lwtunnel state and set nexthop
 * flags. On success the nexthop keeps a reference on the device; on
 * error everything acquired here is released. Returns 0 or a negative
 * errno (extack set for user-visible failures).
 */
int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
		 struct fib6_config *cfg, gfp_t gfp_flags,
		 struct netlink_ext_ack *extack)
{
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	int addr_type;
	int err;

	fib6_nh->fib_nh_family = AF_INET6;

	/* resolve the nexthop device when an ifindex was supplied */
	err = -ENODEV;
	if (cfg->fc_ifindex) {
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	/* onlink nexthops require an explicit, up device */
	if (cfg->fc_flags & RTNH_F_ONLINK) {
		if (!dev) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop device required for onlink");
			goto out;
		}

		if (!(dev->flags & IFF_UP)) {
			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
			err = -ENETDOWN;
			goto out;
		}

		fib6_nh->fib_nh_flags |= RTNH_F_ONLINK;
	}

	if (cfg->fc_encap) {
		struct lwtunnel_state *lwtstate;

		err = lwtunnel_build_state(cfg->fc_encap_type,
					   cfg->fc_encap, AF_INET6, cfg,
					   &lwtstate, extack);
		if (err)
			goto out;

		fib6_nh->fib_nh_lws = lwtstate_get(lwtstate);
	}

	fib6_nh->fib_nh_weight = 1;

	/* We cannot add true routes via loopback here,
	 * they would result in kernel looping; promote them to reject routes
	 */
	addr_type = ipv6_addr_type(&cfg->fc_dst);
	if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		/* reject routes skip the gateway/device-state checks */
		goto set_dev;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		/* may replace dev/idev with the resolved egress device */
		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
		if (err)
			goto out;

		fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
		fib6_nh->fib_nh_has_gw = 1;
	}

	err = -ENODEV;
	if (!dev)
		goto out;

	if (idev->cnf.disable_ipv6) {
		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
		err = -EACCES;
		goto out;
	}

	if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) {
		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
		err = -ENETDOWN;
		goto out;
	}

	if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
	    !netif_carrier_ok(dev))
		fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;

set_dev:
	fib6_nh->fib_nh_dev = dev;
	fib6_nh->fib_nh_oif = dev->ifindex;
	err = 0;
out:
	/* the idev reference was only needed for validation; the device
	 * reference is kept by the nexthop on success and dropped on
	 * error below
	 */
	if (idev)
		in6_dev_put(idev);

	if (err) {
		lwtstate_put(fib6_nh->fib_nh_lws);
		fib6_nh->fib_nh_lws = NULL;
		if (dev)
			dev_put(dev);
	}

	return err;
}
3019
/* Undo fib6_nh_init(): drop the lwtunnel state and the device
 * reference held by the nexthop.
 */
void fib6_nh_release(struct fib6_nh *fib6_nh)
{
	lwtstate_put(fib6_nh->fib_nh_lws);

	if (fib6_nh->fib_nh_dev)
		dev_put(fib6_nh->fib_nh_dev);
}
3027
/* Build a fib6_info from a route config: validate the config, pick or
 * create the FIB table, allocate the entry, and initialize its
 * metrics, prefixes and nexthop. Returns the new entry (caller owns
 * the reference and must insert or release it) or an ERR_PTR.
 */
static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
					      gfp_t gfp_flags,
					      struct netlink_ext_ack *extack)
{
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct fib6_info *rt = NULL;
	struct fib6_table *table;
	int err = -EINVAL;
	int addr_type;

	/* RTF_PCPU is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_PCPU) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
		goto out;
	}

	/* RTF_CACHE is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_CACHE) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
		goto out;
	}

	if (cfg->fc_type > RTN_MAX) {
		NL_SET_ERR_MSG(extack, "Invalid route type");
		goto out;
	}

	if (cfg->fc_dst_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		goto out;
	}
	if (cfg->fc_src_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid source address length");
		goto out;
	}
#ifndef CONFIG_IPV6_SUBTREES
	if (cfg->fc_src_len) {
		NL_SET_ERR_MSG(extack,
			       "Specifying source address requires IPV6_SUBTREES to be enabled");
		goto out;
	}
#endif

	err = -ENOBUFS;
	/* without NLM_F_CREATE, prefer an existing table but fall back
	 * to creating one (with a warning) for compatibility
	 */
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
		if (!table) {
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}

	if (!table)
		goto out;

	err = -ENOMEM;
	rt = fib6_info_alloc(gfp_flags);
	if (!rt)
		goto out;

	rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
					       extack);
	if (IS_ERR(rt->fib6_metrics)) {
		err = PTR_ERR(rt->fib6_metrics);
		/* Do not leave garbage there. */
		rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
		goto out;
	}

	if (cfg->fc_flags & RTF_ADDRCONF)
		rt->dst_nocount = true;

	if (cfg->fc_flags & RTF_EXPIRES)
		fib6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
	else
		fib6_clean_expires(rt);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->fib6_protocol = cfg->fc_protocol;

	rt->fib6_table = table;
	rt->fib6_metric = cfg->fc_metric;
	rt->fib6_type = cfg->fc_type;
	/* the gateway flag is tracked in the nexthop, not in fib6_flags */
	rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY;

	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->fib6_dst.plen = cfg->fc_dst_len;
	if (rt->fib6_dst.plen == 128)
		rt->dst_host = true;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->fib6_src.plen = cfg->fc_src_len;
#endif
	err = fib6_nh_init(net, &rt->fib6_nh, cfg, gfp_flags, extack);
	if (err)
		goto out;

	/* We cannot add true routes via loopback here,
	 * they would result in kernel looping; promote them to reject routes
	 */
	addr_type = ipv6_addr_type(&cfg->fc_dst);
	if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh.fib_nh_dev, addr_type))
		rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;

	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		struct net_device *dev = fib6_info_nh_dev(rt);

		/* the preferred source must be an address on the egress
		 * device
		 */
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid source address");
			err = -EINVAL;
			goto out;
		}
		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
		rt->fib6_prefsrc.plen = 128;
	} else
		rt->fib6_prefsrc.plen = 0;

	return rt;
out:
	fib6_info_release(rt);
	return ERR_PTR(err);
}
3156
3157 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3158                   struct netlink_ext_ack *extack)
3159 {
3160         struct fib6_info *rt;
3161         int err;
3162
3163         rt = ip6_route_info_create(cfg, gfp_flags, extack);
3164         if (IS_ERR(rt))
3165                 return PTR_ERR(rt);
3166
3167         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3168         fib6_info_release(rt);
3169
3170         return err;
3171 }
3172
3173 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3174 {
3175         struct net *net = info->nl_net;
3176         struct fib6_table *table;
3177         int err;
3178
3179         if (rt == net->ipv6.fib6_null_entry) {
3180                 err = -ENOENT;
3181                 goto out;
3182         }
3183
3184         table = rt->fib6_table;
3185         spin_lock_bh(&table->tb6_lock);
3186         err = fib6_del(rt, info);
3187         spin_unlock_bh(&table->tb6_lock);
3188
3189 out:
3190         fib6_info_release(rt);
3191         return err;
3192 }
3193
3194 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3195 {
3196         struct nl_info info = { .nl_net = net };
3197
3198         return __ip6_del_rt(rt, &info);
3199 }
3200
/* Delete @rt and, when fc_delete_all_nh is set, all of its ECMP
 * siblings in one pass under the table lock. Tries to send a single
 * RTM_DELROUTE notification covering every hop. Consumes the caller's
 * reference on @rt.
 */
static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	int err = -ENOENT;

	if (rt == net->ipv6.fib6_null_entry)
		goto out_put;
	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);

	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
		struct fib6_info *sibling, *next_sibling;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			/* fall back to per-hop notifications if the
			 * combined message cannot be built
			 */
			if (rt6_fill_node(net, skb, rt, NULL,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				kfree_skb(skb);
				skb = NULL;
			} else
				info->skip_notify = 1;
		}

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings,
					 fib6_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
		}
	}

	err = fib6_del(rt, info);
out_unlock:
	spin_unlock_bh(&table->tb6_lock);
out_put:
	fib6_info_release(rt);

	/* send the combined notification outside the table lock */
	if (skb) {
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
	return err;
}
3252
3253 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3254 {
3255         int rc = -ESRCH;
3256
3257         if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3258                 goto out;
3259
3260         if (cfg->fc_flags & RTF_GATEWAY &&
3261             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3262                 goto out;
3263
3264         rc = rt6_remove_exception_rt(rt);
3265 out:
3266         return rc;
3267 }
3268
/* Delete the route(s) matching @cfg. With RTF_CACHE only a cached
 * exception route is removed; otherwise the first FIB entry matching
 * the device/gateway/metric/protocol filters is deleted — all ECMP
 * siblings too, unless a specific gateway was given.
 */
static int ip6_route_del(struct fib6_config *cfg,
			 struct netlink_ext_ack *extack)
{
	struct rt6_info *rt_cache;
	struct fib6_table *table;
	struct fib6_info *rt;
	struct fib6_node *fn;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table) {
		NL_SET_ERR_MSG(extack, "FIB table does not exist");
		return err;
	}

	rcu_read_lock();

	/* NOTE(review): the last argument appears to request an exact
	 * match only when not deleting a cached route — confirm against
	 * fib6_locate()
	 */
	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len,
			 !(cfg->fc_flags & RTF_CACHE));

	if (fn) {
		for_each_fib6_node_rt_rcu(fn) {
			struct fib6_nh *nh;

			if (cfg->fc_flags & RTF_CACHE) {
				int rc;

				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
							      &cfg->fc_src);
				if (rt_cache) {
					rc = ip6_del_cached_rt(rt_cache, cfg);
					if (rc != -ESRCH) {
						rcu_read_unlock();
						return rc;
					}
				}
				continue;
			}

			/* apply the optional device/gateway/metric/protocol
			 * filters from the request
			 */
			nh = &rt->fib6_nh;
			if (cfg->fc_ifindex &&
			    (!nh->fib_nh_dev ||
			     nh->fib_nh_dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6))
				continue;
			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
				continue;
			if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
				continue;
			/* skip entries whose refcount already hit zero */
			if (!fib6_info_hold_safe(rt))
				continue;
			rcu_read_unlock();

			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				return __ip6_del_rt(rt, &cfg->fc_nlinfo);

			return __ip6_del_rt_siblings(rt, cfg);
		}
	}
	rcu_read_unlock();

	return err;
}
3337
/* Process an ICMPv6 Redirect (RFC 4861, section 8) carried in @skb for the
 * path described by @dst: validate the message and its ND options, update
 * the neighbour cache for the redirect target, and install a host-route
 * exception (RTF_CACHE clone) pointing at the new first hop.
 */
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct netevent_redirect netevent;
	struct rt6_info *rt, *nrt = NULL;
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	struct fib6_info *from;
	struct rd_msg *msg;
	int optlen, on_link;
	u8 *lladdr;

	/* Length of the ND options that follow the fixed rd_msg header. */
	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
	optlen -= sizeof(*msg);

	if (optlen < 0) {
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
		return;
	}

	msg = (struct rd_msg *)icmp6_hdr(skb);

	if (ipv6_addr_is_multicast(&msg->dest)) {
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
		return;
	}

	/* dest == target means "the destination is on-link"; otherwise the
	 * target must be a link-local unicast address of a router.
	 */
	on_link = 0;
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
		on_link = 1;
	} else if (ipv6_addr_type(&msg->target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
		return;
	}

	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	/* Routers must not accept redirects; hosts may disable them. */
	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}

	/* Optional target link-layer address option. */
	lladdr = NULL;
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

	rt = (struct rt6_info *) dst;
	if (rt->rt6i_flags & RTF_REJECT) {
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
		return;
	}

	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);

	/* Create the neighbour entry for the new first hop if needed. */
	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
	if (!neigh)
		return;

	/*
	 *	We have finally decided to accept it.
	 */

	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER)),
		     NDISC_REDIRECT, &ndopts);

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	/* This fib6_info_hold() is safe here because we hold reference to rt
	 * and rt already holds reference to fib6_info.
	 */
	fib6_info_hold(from);
	rcu_read_unlock();

	/* Clone a cached route for msg->dest via the new gateway. */
	nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
	if (!nrt)
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

	/* No need to remove rt from the exception table if rt is
	 * a cached route because rt6_insert_exception() will
	 * takes care of it
	 */
	if (rt6_insert_exception(nrt, from)) {
		dst_release_immediate(&nrt->dst);
		goto out;
	}

	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
	netevent.daddr = &msg->dest;
	netevent.neigh = neigh;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

out:
	fib6_info_release(from);
	neigh_release(neigh);
}
3464
3465 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Look up an RA Route Information route for @prefix/@prefixlen learned via
 * @gwaddr on @dev.  Returns the fib6_info with a reference held, or NULL
 * when no matching RTF_ROUTEINFO entry exists.
 */
static struct fib6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	int ifindex = dev->ifindex;
	struct fib6_node *fn;
	struct fib6_info *rt = NULL;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
	if (!fn)
		goto out;

	/* The iterator assigns rt; it is NULL when the walk is exhausted. */
	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.fib_nh_dev->ifindex != ifindex)
			continue;
		if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
		    !rt->fib6_nh.fib_nh_has_gw)
			continue;
		if (!ipv6_addr_equal(&rt->fib6_nh.fib_nh_gw6, gwaddr))
			continue;
		/* Entry is being freed; keep looking for another match. */
		if (!fib6_info_hold_safe(rt))
			continue;
		break;
	}
out:
	rcu_read_unlock();
	return rt;
}
3502
3503 static struct fib6_info *rt6_add_route_info(struct net *net,
3504                                            const struct in6_addr *prefix, int prefixlen,
3505                                            const struct in6_addr *gwaddr,
3506                                            struct net_device *dev,
3507                                            unsigned int pref)
3508 {
3509         struct fib6_config cfg = {
3510                 .fc_metric      = IP6_RT_PRIO_USER,
3511                 .fc_ifindex     = dev->ifindex,
3512                 .fc_dst_len     = prefixlen,
3513                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3514                                   RTF_UP | RTF_PREF(pref),
3515                 .fc_protocol = RTPROT_RA,
3516                 .fc_type = RTN_UNICAST,
3517                 .fc_nlinfo.portid = 0,
3518                 .fc_nlinfo.nlh = NULL,
3519                 .fc_nlinfo.nl_net = net,
3520         };
3521
3522         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3523         cfg.fc_dst = *prefix;
3524         cfg.fc_gateway = *gwaddr;
3525
3526         /* We should treat it as a default route if prefix length is 0. */
3527         if (!prefixlen)
3528                 cfg.fc_flags |= RTF_DEFAULT;
3529
3530         ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3531
3532         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3533 }
3534 #endif
3535
/* Find the RA-learned default route whose gateway is @addr on @dev.
 * Returns the fib6_info with a reference held, or NULL if none matches
 * or the matching entry is already being freed.
 */
struct fib6_info *rt6_get_dflt_router(struct net *net,
				     const struct in6_addr *addr,
				     struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
	struct fib6_info *rt;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	/* The iterator assigns rt; it is NULL when the walk is exhausted. */
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		struct fib6_nh *nh = &rt->fib6_nh;

		if (dev == nh->fib_nh_dev &&
		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&nh->fib_nh_gw6, addr))
			break;
	}
	if (rt && !fib6_info_hold_safe(rt))
		rt = NULL;
	rcu_read_unlock();
	return rt;
}
3562
3563 struct fib6_info *rt6_add_dflt_router(struct net *net,
3564                                      const struct in6_addr *gwaddr,
3565                                      struct net_device *dev,
3566                                      unsigned int pref)
3567 {
3568         struct fib6_config cfg = {
3569                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3570                 .fc_metric      = IP6_RT_PRIO_USER,
3571                 .fc_ifindex     = dev->ifindex,
3572                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3573                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3574                 .fc_protocol = RTPROT_RA,
3575                 .fc_type = RTN_UNICAST,
3576                 .fc_nlinfo.portid = 0,
3577                 .fc_nlinfo.nlh = NULL,
3578                 .fc_nlinfo.nl_net = net,
3579         };
3580
3581         cfg.fc_gateway = *gwaddr;
3582
3583         if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3584                 struct fib6_table *table;
3585
3586                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3587                 if (table)
3588                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3589         }
3590
3591         return rt6_get_dflt_router(net, gwaddr, dev);
3592 }
3593
/* Delete every RA-learned default router entry from @table, except on
 * interfaces with accept_ra == 2 (accept RAs even when forwarding).
 * ip6_del_rt() cannot run under rcu_read_lock(), so each deletion drops
 * the lock and the walk restarts from the top.
 */
static void __rt6_purge_dflt_routers(struct net *net,
				     struct fib6_table *table)
{
	struct fib6_info *rt;

restart:
	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;

		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!idev || idev->cnf.accept_ra != 2) &&
		    fib6_info_hold_safe(rt)) {
			/* Hold taken above keeps rt alive across the unlock;
			 * ip6_del_rt() releases it.
			 */
			rcu_read_unlock();
			ip6_del_rt(net, rt);
			goto restart;
		}
	}
	rcu_read_unlock();

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}
3617
3618 void rt6_purge_dflt_routers(struct net *net)
3619 {
3620         struct fib6_table *table;
3621         struct hlist_head *head;
3622         unsigned int h;
3623
3624         rcu_read_lock();
3625
3626         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3627                 head = &net->ipv6.fib_table_hash[h];
3628                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3629                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3630                                 __rt6_purge_dflt_routers(net, table);
3631                 }
3632         }
3633
3634         rcu_read_unlock();
3635 }
3636
3637 static void rtmsg_to_fib6_config(struct net *net,
3638                                  struct in6_rtmsg *rtmsg,
3639                                  struct fib6_config *cfg)
3640 {
3641         *cfg = (struct fib6_config){
3642                 .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3643                          : RT6_TABLE_MAIN,
3644                 .fc_ifindex = rtmsg->rtmsg_ifindex,
3645                 .fc_metric = rtmsg->rtmsg_metric ? : IP6_RT_PRIO_USER,
3646                 .fc_expires = rtmsg->rtmsg_info,
3647                 .fc_dst_len = rtmsg->rtmsg_dst_len,
3648                 .fc_src_len = rtmsg->rtmsg_src_len,
3649                 .fc_flags = rtmsg->rtmsg_flags,
3650                 .fc_type = rtmsg->rtmsg_type,
3651
3652                 .fc_nlinfo.nl_net = net,
3653
3654                 .fc_dst = rtmsg->rtmsg_dst,
3655                 .fc_src = rtmsg->rtmsg_src,
3656                 .fc_gateway = rtmsg->rtmsg_gateway,
3657         };
3658 }
3659
3660 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3661 {
3662         struct fib6_config cfg;
3663         struct in6_rtmsg rtmsg;
3664         int err;
3665
3666         switch (cmd) {
3667         case SIOCADDRT:         /* Add a route */
3668         case SIOCDELRT:         /* Delete a route */
3669                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3670                         return -EPERM;
3671                 err = copy_from_user(&rtmsg, arg,
3672                                      sizeof(struct in6_rtmsg));
3673                 if (err)
3674                         return -EFAULT;
3675
3676                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3677
3678                 rtnl_lock();
3679                 switch (cmd) {
3680                 case SIOCADDRT:
3681                         err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3682                         break;
3683                 case SIOCDELRT:
3684                         err = ip6_route_del(&cfg, NULL);
3685                         break;
3686                 default:
3687                         err = -EINVAL;
3688                 }
3689                 rtnl_unlock();
3690
3691                 return err;
3692         }
3693
3694         return -EINVAL;
3695 }
3696
3697 /*
3698  *      Drop the packet on the floor
3699  */
3700
3701 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3702 {
3703         int type;
3704         struct dst_entry *dst = skb_dst(skb);
3705         switch (ipstats_mib_noroutes) {
3706         case IPSTATS_MIB_INNOROUTES:
3707                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3708                 if (type == IPV6_ADDR_ANY) {
3709                         IP6_INC_STATS(dev_net(dst->dev),
3710                                       __in6_dev_get_safely(skb->dev),
3711                                       IPSTATS_MIB_INADDRERRORS);
3712                         break;
3713                 }
3714                 /* FALLTHROUGH */
3715         case IPSTATS_MIB_OUTNOROUTES:
3716                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3717                               ipstats_mib_noroutes);
3718                 break;
3719         }
3720         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3721         kfree_skb(skb);
3722         return 0;
3723 }
3724
/* Input handler that drops the packet, counting it as "no route in" and
 * reporting ICMPV6_NOROUTE to the sender.
 */
static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
3729
3730 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3731 {
3732         skb->dev = skb_dst(skb)->dev;
3733         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3734 }
3735
/* Input handler that drops the packet, counting it as "no route in" and
 * reporting ICMPV6_ADM_PROHIBITED to the sender.
 */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}
3740
3741 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3742 {
3743         skb->dev = skb_dst(skb)->dev;
3744         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3745 }
3746
3747 /*
3748  *      Allocate a dst for local (unicast / anycast) address.
3749  */
3750
3751 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3752                                      struct inet6_dev *idev,
3753                                      const struct in6_addr *addr,
3754                                      bool anycast, gfp_t gfp_flags)
3755 {
3756         struct fib6_config cfg = {
3757                 .fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL,
3758                 .fc_ifindex = idev->dev->ifindex,
3759                 .fc_flags = RTF_UP | RTF_ADDRCONF | RTF_NONEXTHOP,
3760                 .fc_dst = *addr,
3761                 .fc_dst_len = 128,
3762                 .fc_protocol = RTPROT_KERNEL,
3763                 .fc_nlinfo.nl_net = net,
3764                 .fc_ignore_dev_down = true,
3765         };
3766
3767         if (anycast) {
3768                 cfg.fc_type = RTN_ANYCAST;
3769                 cfg.fc_flags |= RTF_ANYCAST;
3770         } else {
3771                 cfg.fc_type = RTN_LOCAL;
3772                 cfg.fc_flags |= RTF_LOCAL;
3773         }
3774
3775         return ip6_route_info_create(&cfg, gfp_flags, NULL);
3776 }
3777
3778 /* remove deleted ip from prefsrc entries */
struct arg_dev_net_ip {
	struct net_device *dev;	/* restrict to this device; NULL = any */
	struct net *net;	/* namespace being walked */
	struct in6_addr *addr;	/* prefsrc address being removed */
};
3784
3785 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3786 {
3787         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3788         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3789         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3790
3791         if (((void *)rt->fib6_nh.fib_nh_dev == dev || !dev) &&
3792             rt != net->ipv6.fib6_null_entry &&
3793             ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3794                 spin_lock_bh(&rt6_exception_lock);
3795                 /* remove prefsrc entry */
3796                 rt->fib6_prefsrc.plen = 0;
3797                 spin_unlock_bh(&rt6_exception_lock);
3798         }
3799         return 0;
3800 }
3801
/* Address @ifp is going away: scrub it as the preferred source address
 * from every FIB entry on its device (see fib6_remove_prefsrc()).
 */
void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
{
	struct net *net = dev_net(ifp->idev->dev);
	struct arg_dev_net_ip adni = {
		.dev = ifp->idev->dev,
		.net = net,
		.addr = &ifp->addr,
	};
	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
}
3812
3813 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT)
3814
3815 /* Remove routers and update dst entries when gateway turn into host. */
3816 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3817 {
3818         struct in6_addr *gateway = (struct in6_addr *)arg;
3819
3820         if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3821             rt->fib6_nh.fib_nh_has_gw &&
3822             ipv6_addr_equal(gateway, &rt->fib6_nh.fib_nh_gw6)) {
3823                 return -1;
3824         }
3825
3826         /* Further clean up cached routes in exception table.
3827          * This is needed because cached route may have a different
3828          * gateway than its 'parent' in the case of an ip redirect.
3829          */
3830         rt6_exceptions_clean_tohost(rt, gateway);
3831
3832         return 0;
3833 }
3834
/* @gateway turned from router into host: walk all FIB entries and apply
 * fib6_clean_tohost() to each (drops matching RA default routes and
 * scrubs cached exceptions).
 */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}
3839
/* Argument for the fib6_ifup/fib6_ifdown walkers: the device concerned
 * plus either nexthop flags to set/clear (rt6_sync_up) or the netdev
 * event being handled (rt6_sync_down_dev).
 */
struct arg_netdev_event {
	const struct net_device *dev;
	union {
		unsigned int nh_flags;
		unsigned long event;
	};
};
3847
3848 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3849 {
3850         struct fib6_info *iter;
3851         struct fib6_node *fn;
3852
3853         fn = rcu_dereference_protected(rt->fib6_node,
3854                         lockdep_is_held(&rt->fib6_table->tb6_lock));
3855         iter = rcu_dereference_protected(fn->leaf,
3856                         lockdep_is_held(&rt->fib6_table->tb6_lock));
3857         while (iter) {
3858                 if (iter->fib6_metric == rt->fib6_metric &&
3859                     rt6_qualify_for_ecmp(iter))
3860                         return iter;
3861                 iter = rcu_dereference_protected(iter->fib6_next,
3862                                 lockdep_is_held(&rt->fib6_table->tb6_lock));
3863         }
3864
3865         return NULL;
3866 }
3867
3868 static bool rt6_is_dead(const struct fib6_info *rt)
3869 {
3870         if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD ||
3871             (rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN &&
3872              ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev)))
3873                 return true;
3874
3875         return false;
3876 }
3877
3878 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3879 {
3880         struct fib6_info *iter;
3881         int total = 0;
3882
3883         if (!rt6_is_dead(rt))
3884                 total += rt->fib6_nh.fib_nh_weight;
3885
3886         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3887                 if (!rt6_is_dead(iter))
3888                         total += iter->fib6_nh.fib_nh_weight;
3889         }
3890
3891         return total;
3892 }
3893
3894 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3895 {
3896         int upper_bound = -1;
3897
3898         if (!rt6_is_dead(rt)) {
3899                 *weight += rt->fib6_nh.fib_nh_weight;
3900                 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3901                                                     total) - 1;
3902         }
3903         atomic_set(&rt->fib6_nh.fib_nh_upper_bound, upper_bound);
3904 }
3905
3906 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3907 {
3908         struct fib6_info *iter;
3909         int weight = 0;
3910
3911         rt6_upper_bound_set(rt, &weight, total);
3912
3913         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3914                 rt6_upper_bound_set(iter, &weight, total);
3915 }
3916
3917 void rt6_multipath_rebalance(struct fib6_info *rt)
3918 {
3919         struct fib6_info *first;
3920         int total;
3921
3922         /* In case the entire multipath route was marked for flushing,
3923          * then there is no need to rebalance upon the removal of every
3924          * sibling route.
3925          */
3926         if (!rt->fib6_nsiblings || rt->should_flush)
3927                 return;
3928
3929         /* During lookup routes are evaluated in order, so we need to
3930          * make sure upper bounds are assigned from the first sibling
3931          * onwards.
3932          */
3933         first = rt6_multipath_first_sibling(rt);
3934         if (WARN_ON_ONCE(!first))
3935                 return;
3936
3937         total = rt6_multipath_total_weight(first);
3938         rt6_multipath_upper_bound_set(first, total);
3939 }
3940
3941 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3942 {
3943         const struct arg_netdev_event *arg = p_arg;
3944         struct net *net = dev_net(arg->dev);
3945
3946         if (rt != net->ipv6.fib6_null_entry &&
3947             rt->fib6_nh.fib_nh_dev == arg->dev) {
3948                 rt->fib6_nh.fib_nh_flags &= ~arg->nh_flags;
3949                 fib6_update_sernum_upto_root(net, rt);
3950                 rt6_multipath_rebalance(rt);
3951         }
3952
3953         return 0;
3954 }
3955
3956 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3957 {
3958         struct arg_netdev_event arg = {
3959                 .dev = dev,
3960                 {
3961                         .nh_flags = nh_flags,
3962                 },
3963         };
3964
3965         if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3966                 arg.nh_flags |= RTNH_F_LINKDOWN;
3967
3968         fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3969 }
3970
3971 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3972                                    const struct net_device *dev)
3973 {
3974         struct fib6_info *iter;
3975
3976         if (rt->fib6_nh.fib_nh_dev == dev)
3977                 return true;
3978         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3979                 if (iter->fib6_nh.fib_nh_dev == dev)
3980                         return true;
3981
3982         return false;
3983 }
3984
3985 static void rt6_multipath_flush(struct fib6_info *rt)
3986 {
3987         struct fib6_info *iter;
3988
3989         rt->should_flush = 1;
3990         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3991                 iter->should_flush = 1;
3992 }
3993
3994 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
3995                                              const struct net_device *down_dev)
3996 {
3997         struct fib6_info *iter;
3998         unsigned int dead = 0;
3999
4000         if (rt->fib6_nh.fib_nh_dev == down_dev ||
4001             rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
4002                 dead++;
4003         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4004                 if (iter->fib6_nh.fib_nh_dev == down_dev ||
4005                     iter->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
4006                         dead++;
4007
4008         return dead;
4009 }
4010
4011 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
4012                                        const struct net_device *dev,
4013                                        unsigned int nh_flags)
4014 {
4015         struct fib6_info *iter;
4016
4017         if (rt->fib6_nh.fib_nh_dev == dev)
4018                 rt->fib6_nh.fib_nh_flags |= nh_flags;
4019         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4020                 if (iter->fib6_nh.fib_nh_dev == dev)
4021                         iter->fib6_nh.fib_nh_flags |= nh_flags;
4022 }
4023
/* called with write lock held for table with rt
 *
 * fib6_clean_all() callback for device-down events.  Return value drives
 * the walker: 0 keeps the route, -1 asks for deletion; -2 is a variant
 * used for multipath routes handled in place here.
 * NOTE(review): the exact -1/-2 semantics are implemented by the walker —
 * confirm against fib6_clean_node() in ip6_fib.c.
 */
static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
{
	const struct arg_netdev_event *arg = p_arg;
	const struct net_device *dev = arg->dev;
	struct net *net = dev_net(dev);

	/* The null entry is never tied to a real device. */
	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	switch (arg->event) {
	case NETDEV_UNREGISTER:
		/* Device is going away entirely: delete every route on it. */
		return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
	case NETDEV_DOWN:
		/* Whole multipath route already marked for removal. */
		if (rt->should_flush)
			return -1;
		/* Single-path route: delete if it uses this device. */
		if (!rt->fib6_nsiblings)
			return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
		if (rt6_multipath_uses_dev(rt, dev)) {
			unsigned int count;

			count = rt6_multipath_dead_count(rt, dev);
			if (rt->fib6_nsiblings + 1 == count) {
				/* Every nexthop is dead: flush the whole
				 * multipath route.
				 */
				rt6_multipath_flush(rt);
				return -1;
			}
			/* Some nexthops survive: mark the affected ones
			 * dead and redistribute the selection weights.
			 */
			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
						   RTNH_F_LINKDOWN);
			fib6_update_sernum(net, rt);
			rt6_multipath_rebalance(rt);
		}
		return -2;
	case NETDEV_CHANGE:
		/* Carrier change: local/anycast routes keep working. */
		if (rt->fib6_nh.fib_nh_dev != dev ||
		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
			break;
		rt->fib6_nh.fib_nh_flags |= RTNH_F_LINKDOWN;
		rt6_multipath_rebalance(rt);
		break;
	}

	return 0;
}
4067
4068 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4069 {
4070         struct arg_netdev_event arg = {
4071                 .dev = dev,
4072                 {
4073                         .event = event,
4074                 },
4075         };
4076         struct net *net = dev_net(dev);
4077
4078         if (net->ipv6.sysctl.skip_notify_on_dev_down)
4079                 fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
4080         else
4081                 fib6_clean_all(net, fib6_ifdown, &arg);
4082 }
4083
/* Take @dev out of IPv6 service for @event: sync FIB routes, flush the
 * uncached routes referencing the device, and drop its ND neighbour
 * entries.
 */
void rt6_disable_ip(struct net_device *dev, unsigned long event)
{
	rt6_sync_down_dev(dev, event);
	rt6_uncached_list_flush_dev(dev_net(dev), dev);
	neigh_ifdown(&nd_tbl, dev);
}
4090
/* Argument for the rt6_mtu_change_route() walker. */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* its new MTU */
};
4095
/* fib6_clean_all() callback: propagate a device MTU change (see struct
 * rt6_mtu_change_arg) into matching routes and their cached exceptions.
 * Always returns 0 so the walk continues.
 */
static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e. jumbo frame)
	 */
	if (rt->fib6_nh.fib_nh_dev == arg->dev &&
	    !fib6_metric_locked(rt, RTAX_MTU)) {
		u32 mtu = rt->fib6_pmtu;

		/* Shrink to the new device MTU, or grow a PMTU that was
		 * pinned to the old device MTU (see comment above).
		 */
		if (mtu >= arg->mtu ||
		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
			fib6_metric_set(rt, RTAX_MTU, arg->mtu);

		spin_lock_bh(&rt6_exception_lock);
		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}
4130
4131 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4132 {
4133         struct rt6_mtu_change_arg arg = {
4134                 .dev = dev,
4135                 .mtu = mtu,
4136         };
4137
4138         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4139 }
4140
/* Netlink attribute validation policy for RTM_{NEW,DEL,GET}ROUTE. */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]		= { .len = sizeof(struct in6_addr) },
	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
	[RTA_OIF]		= { .type = NLA_U32 },
	[RTA_IIF]		= { .type = NLA_U32 },
	[RTA_PRIORITY]		= { .type = NLA_U32 },
	[RTA_METRICS]		= { .type = NLA_NESTED },
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]		= { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
	[RTA_ENCAP]		= { .type = NLA_NESTED },
	[RTA_EXPIRES]		= { .type = NLA_U32 },
	[RTA_UID]		= { .type = NLA_U32 },
	[RTA_MARK]		= { .type = NLA_U32 },
	[RTA_TABLE]		= { .type = NLA_U32 },
	[RTA_IP_PROTO]		= { .type = NLA_U8 },
	[RTA_SPORT]		= { .type = NLA_U16 },
	[RTA_DPORT]		= { .type = NLA_U16 },
};
4160
/* Translate an RTM_NEWROUTE/RTM_DELROUTE netlink request into a
 * struct fib6_config.  On success several fields (fc_mx, fc_mp,
 * fc_encap) point into the attribute payload of @nlh, so the config
 * is only valid while the netlink message is.  Returns 0 on success
 * or a negative errno (attribute length errors map to -EINVAL).
 */
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg,
			      struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  extack);
	if (err < 0)
		goto errout;

	/* default error for all validation failures below */
	err = -EINVAL;
	rtm = nlmsg_data(nlh);

	/* Start from the rtmsg header; attributes below may override
	 * fc_table (RTA_TABLE) and add flags.
	 */
	*cfg = (struct fib6_config){
		.fc_table = rtm->rtm_table,
		.fc_dst_len = rtm->rtm_dst_len,
		.fc_src_len = rtm->rtm_src_len,
		.fc_flags = RTF_UP,
		.fc_protocol = rtm->rtm_protocol,
		.fc_type = rtm->rtm_type,

		.fc_nlinfo.portid = NETLINK_CB(skb).portid,
		.fc_nlinfo.nlh = nlh,
		.fc_nlinfo.nl_net = sock_net(skb->sk),
	};

	/* special route types are all represented as RTF_REJECT routes;
	 * fc_type retains the specific semantics (unreachable/blackhole/...)
	 */
	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}
	if (tb[RTA_VIA]) {
		NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
		goto errout;
	}

	if (tb[RTA_DST]) {
		/* attribute must carry at least the bytes covered by the
		 * prefix length (rounded up to whole bytes)
		 */
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		/* points into the nlmsg payload; not copied */
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		/* nexthop list; parsed later by ip6_route_multipath_add/del */
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
						     cfg->fc_mp_len, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_PREF]) {
		/* unknown preference values fall back to medium (RFC 4191) */
		pref = nla_get_u8(tb[RTA_PREF]);
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

	if (tb[RTA_ENCAP_TYPE]) {
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_EXPIRES]) {
		/* RTA_EXPIRES is in seconds; clamp via addrconf_timeout_fixup
		 * so the jiffies conversion below cannot overflow
		 */
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
		}
	}

	err = 0;
errout:
	return err;
}
4291
/* Scratch list node used while building a multipath route: one entry per
 * rtnexthop parsed from RTA_MULTIPATH.  Holds the fib6_info awaiting
 * insertion and the per-nexthop config needed to delete it on rollback.
 */
struct rt6_nh {
	struct fib6_info *fib6_info;	/* route to insert; ref held until inserted */
	struct fib6_config r_cfg;	/* config used for rollback via ip6_route_del */
	struct list_head next;		/* linkage on the local rt6_nh_list */
};
4297
4298 static int ip6_route_info_append(struct net *net,
4299                                  struct list_head *rt6_nh_list,
4300                                  struct fib6_info *rt,
4301                                  struct fib6_config *r_cfg)
4302 {
4303         struct rt6_nh *nh;
4304         int err = -EEXIST;
4305
4306         list_for_each_entry(nh, rt6_nh_list, next) {
4307                 /* check if fib6_info already exists */
4308                 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4309                         return err;
4310         }
4311
4312         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4313         if (!nh)
4314                 return -ENOMEM;
4315         nh->fib6_info = rt;
4316         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4317         list_add_tail(&nh->next, rt6_nh_list);
4318
4319         return 0;
4320 }
4321
4322 static void ip6_route_mpath_notify(struct fib6_info *rt,
4323                                    struct fib6_info *rt_last,
4324                                    struct nl_info *info,
4325                                    __u16 nlflags)
4326 {
4327         /* if this is an APPEND route, then rt points to the first route
4328          * inserted and rt_last points to last route inserted. Userspace
4329          * wants a consistent dump of the route which starts at the first
4330          * nexthop. Since sibling routes are always added at the end of
4331          * the list, find the first sibling of the last route appended
4332          */
4333         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4334                 rt = list_first_entry(&rt_last->fib6_siblings,
4335                                       struct fib6_info,
4336                                       fib6_siblings);
4337         }
4338
4339         if (rt)
4340                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4341 }
4342
/* Install a multipath (ECMP) IPv6 route described by RTA_MULTIPATH in
 * @cfg.  Every rtnexthop is turned into its own fib6_info and queued on
 * a local list, then the entries are inserted one by one (the fib code
 * links successive inserts into a sibling group).  On partial failure,
 * routes already inserted are deleted again, making the operation
 * all-or-nothing.  Returns 0 on success or a negative errno.
 */
static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct fib6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	__u16 nlflags;
	int remaining;
	int attrlen;
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);

	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * fib6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		/* start each nexthop from the common config, then apply
		 * per-nexthop overrides (ifindex, gateway, encap)
		 */
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}
		/* only gateway routes may take part in ECMP */
		if (!rt6_qualify_for_ecmp(rt)) {
			err = -EINVAL;
			NL_SET_ERR_MSG(extack,
				       "Device only routes can not be added for IPv6 using the multipath API.");
			fib6_info_release(rt);
			goto cleanup;
		}

		/* rtnh_hops carries weight - 1 on the wire */
		rt->fib6_nh.fib_nh_weight = rtnh->rtnh_hops + 1;

		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
					    rt, &r_cfg);
		if (err) {
			fib6_info_release(rt);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* for add and replace send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		err = __ip6_ins_rt(nh->fib6_info, info, extack);
		/* drop the creation reference; on success the fib table
		 * now holds its own reference to the fib6_info
		 */
		fib6_info_release(nh->fib6_info);

		if (!err) {
			/* save reference to last route successfully inserted */
			rt_last = nh->fib6_info;

			/* save reference to first route for notification */
			if (!rt_notif)
				rt_notif = nh->fib6_info;
		}

		/* nh->fib6_info is used or freed at this point, reset to NULL*/
		nh->fib6_info = NULL;
		if (err) {
			if (replace && nhn)
				NL_SET_ERR_MSG_MOD(extack,
						   "multipath route replace failed (check consistency of installed routes)");
			err_nh = nh;
			goto add_errout;
		}

		/* Because each route is added like a single route we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
		 *
		 * NOTE(review): dereferences fc_nlinfo.nlh unconditionally;
		 * holds for netlink requests — confirm for any in-kernel
		 * caller that leaves nlh NULL.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		nhn++;
	}

	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	goto cleanup;

add_errout:
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg, extack);
	}

cleanup:
	/* free the scratch list; entries not yet inserted still own a
	 * reference on their fib6_info and must release it here
	 */
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->fib6_info)
			fib6_info_release(nh->fib6_info);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}
4490
4491 static int ip6_route_multipath_del(struct fib6_config *cfg,
4492                                    struct netlink_ext_ack *extack)
4493 {
4494         struct fib6_config r_cfg;
4495         struct rtnexthop *rtnh;
4496         int remaining;
4497         int attrlen;
4498         int err = 1, last_err = 0;
4499
4500         remaining = cfg->fc_mp_len;
4501         rtnh = (struct rtnexthop *)cfg->fc_mp;
4502
4503         /* Parse a Multipath Entry */
4504         while (rtnh_ok(rtnh, remaining)) {
4505                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4506                 if (rtnh->rtnh_ifindex)
4507                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4508
4509                 attrlen = rtnh_attrlen(rtnh);
4510                 if (attrlen > 0) {
4511                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4512
4513                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4514                         if (nla) {
4515                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4516                                 r_cfg.fc_flags |= RTF_GATEWAY;
4517                         }
4518                 }
4519                 err = ip6_route_del(&r_cfg, extack);
4520                 if (err)
4521                         last_err = err;
4522
4523                 rtnh = rtnh_next(rtnh, &remaining);
4524         }
4525
4526         return last_err;
4527 }
4528
4529 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4530                               struct netlink_ext_ack *extack)
4531 {
4532         struct fib6_config cfg;
4533         int err;
4534
4535         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4536         if (err < 0)
4537                 return err;
4538
4539         if (cfg.fc_mp)
4540                 return ip6_route_multipath_del(&cfg, extack);
4541         else {
4542                 cfg.fc_delete_all_nh = 1;
4543                 return ip6_route_del(&cfg, extack);
4544         }
4545 }
4546
4547 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4548                               struct netlink_ext_ack *extack)
4549 {
4550         struct fib6_config cfg;
4551         int err;
4552
4553         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4554         if (err < 0)
4555                 return err;
4556
4557         if (cfg.fc_metric == 0)
4558                 cfg.fc_metric = IP6_RT_PRIO_USER;
4559
4560         if (cfg.fc_mp)
4561                 return ip6_route_multipath_add(&cfg, extack);
4562         else
4563                 return ip6_route_add(&cfg, GFP_KERNEL, extack);
4564 }
4565
4566 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4567 {
4568         int nexthop_len = 0;
4569
4570         if (rt->fib6_nsiblings) {
4571                 nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
4572                             + NLA_ALIGN(sizeof(struct rtnexthop))
4573                             + nla_total_size(16) /* RTA_GATEWAY */
4574                             + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws);
4575
4576                 nexthop_len *= rt->fib6_nsiblings;
4577         }
4578
4579         return NLMSG_ALIGN(sizeof(struct rtmsg))
4580                + nla_total_size(16) /* RTA_SRC */
4581                + nla_total_size(16) /* RTA_DST */
4582                + nla_total_size(16) /* RTA_GATEWAY */
4583                + nla_total_size(16) /* RTA_PREFSRC */
4584                + nla_total_size(4) /* RTA_TABLE */
4585                + nla_total_size(4) /* RTA_IIF */
4586                + nla_total_size(4) /* RTA_OIF */
4587                + nla_total_size(4) /* RTA_PRIORITY */
4588                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4589                + nla_total_size(sizeof(struct rta_cacheinfo))
4590                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4591                + nla_total_size(1) /* RTA_PREF */
4592                + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws)
4593                + nexthop_len;
4594 }
4595
/* Fill nexthop attributes (RTA_GATEWAY, RTA_OIF, lwtunnel encap) for a
 * route dump and accumulate RTNH_F_* state into *flags.  With @skip_oif
 * set (multipath encoding) RTA_OIF is omitted because the enclosing
 * rtnexthop struct already carries the ifindex.  Returns 0 or -EMSGSIZE
 * when the skb runs out of room.
 */
static int rt6_nexthop_info(struct sk_buff *skb, const struct fib6_nh *fib6_nh,
			    unsigned int *flags, bool skip_oif)
{
	if (fib6_nh->fib_nh_flags & RTNH_F_DEAD)
		*flags |= RTNH_F_DEAD;

	if (fib6_nh->fib_nh_flags & RTNH_F_LINKDOWN) {
		*flags |= RTNH_F_LINKDOWN;

		/* report a linkdown nexthop as dead when the device is
		 * configured to ignore routes on linkdown
		 */
		rcu_read_lock();
		if (ip6_ignore_linkdown(fib6_nh->fib_nh_dev))
			*flags |= RTNH_F_DEAD;
		rcu_read_unlock();
	}

	if (fib6_nh->fib_nh_has_gw) {
		if (nla_put_in6_addr(skb, RTA_GATEWAY, &fib6_nh->fib_nh_gw6) < 0)
			goto nla_put_failure;
	}

	*flags |= (fib6_nh->fib_nh_flags & RTNH_F_ONLINK);
	if (fib6_nh->fib_nh_flags & RTNH_F_OFFLOAD)
		*flags |= RTNH_F_OFFLOAD;

	/* not needed for multipath encoding b/c it has a rtnexthop struct */
	if (!skip_oif && fib6_nh->fib_nh_dev &&
	    nla_put_u32(skb, RTA_OIF, fib6_nh->fib_nh_dev->ifindex))
		goto nla_put_failure;

	if (fib6_nh->fib_nh_lws &&
	    lwtunnel_fill_encap(skb, fib6_nh->fib_nh_lws) < 0)
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
4634
4635 /* add multipath next hop */
4636 static int rt6_add_nexthop(struct sk_buff *skb, const struct fib6_nh *fib6_nh)
4637 {
4638         const struct net_device *dev = fib6_nh->fib_nh_dev;
4639         struct rtnexthop *rtnh;
4640         unsigned int flags = 0;
4641
4642         rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4643         if (!rtnh)
4644                 goto nla_put_failure;
4645
4646         rtnh->rtnh_hops = fib6_nh->fib_nh_weight - 1;
4647         rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
4648
4649         if (rt6_nexthop_info(skb, fib6_nh, &flags, true) < 0)
4650                 goto nla_put_failure;
4651
4652         rtnh->rtnh_flags = flags;
4653
4654         /* length of rtnetlink header + attributes */
4655         rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4656
4657         return 0;
4658
4659 nla_put_failure:
4660         return -EMSGSIZE;
4661 }
4662
/* Build one RTM_NEWROUTE/RTM_DELROUTE message for @rt into @skb.
 * @dst, when non-NULL, is the rt6_info a lookup produced; its keys and
 * flags then take precedence over the fib entry's.  @dest/@src, when
 * set, report the full /128 addresses of a specific lookup rather than
 * the route's prefix.  @iif marks an input-interface lookup.  Returns 0
 * or -EMSGSIZE (message cancelled).
 */
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	struct rt6_info *rt6 = (struct rt6_info *)dst;
	struct rt6key *rt6_dst, *rt6_src;
	u32 *pmetrics, table, rt6_flags;
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;
	long expires = 0;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	/* prefer the cached dst's keys/flags when one was supplied */
	if (rt6) {
		rt6_dst = &rt6->rt6i_dst;
		rt6_src = &rt6->rt6i_src;
		rt6_flags = rt6->rt6i_flags;
	} else {
		rt6_dst = &rt->fib6_dst;
		rt6_src = &rt->fib6_src;
		rt6_flags = rt->fib6_flags;
	}

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt6_dst->plen;
	rtm->rtm_src_len = rt6_src->plen;
	rtm->rtm_tos = 0;
	if (rt->fib6_table)
		table = rt->fib6_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	/* the u8 header field cannot hold tables >= 256; the real id is
	 * always present in RTA_TABLE
	 */
	rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;

	rtm->rtm_type = rt->fib6_type;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->fib6_protocol;

	if (rt6_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	/* a concrete lookup result reports the full /128 destination */
	if (dest) {
		if (nla_put_in6_addr(skb, RTA_DST, dest))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		/* multicast destinations are resolved via the mroute
		 * tables; ip6mr_get_route fills the message itself
		 */
		if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dest) {
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->fib6_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->fib6_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	/* cached dst carries its own (possibly modified) metrics */
	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt6) {
		if (rt6_flags & RTF_GATEWAY &&
		    nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
			goto nla_put_failure;

		if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
			goto nla_put_failure;
	} else if (rt->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (rt6_add_nexthop(skb, &rt->fib6_nh) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings, fib6_siblings) {
			if (rt6_add_nexthop(skb, &sibling->fib6_nh) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		if (rt6_nexthop_info(skb, &rt->fib6_nh, &rtm->rtm_flags,
				     false) < 0)
			goto nla_put_failure;
	}

	/* report remaining lifetime (may go negative if already expired) */
	if (rt6_flags & RTF_EXPIRES) {
		expires = dst ? dst->expires : rt->expires;
		expires -= jiffies;
	}

	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
		goto nla_put_failure;


	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
4814
4815 static bool fib6_info_uses_dev(const struct fib6_info *f6i,
4816                                const struct net_device *dev)
4817 {
4818         if (f6i->fib6_nh.fib_nh_dev == dev)
4819                 return true;
4820
4821         if (f6i->fib6_nsiblings) {
4822                 struct fib6_info *sibling, *next_sibling;
4823
4824                 list_for_each_entry_safe(sibling, next_sibling,
4825                                          &f6i->fib6_siblings, fib6_siblings) {
4826                         if (sibling->fib6_nh.fib_nh_dev == dev)
4827                                 return true;
4828                 }
4829         }
4830
4831         return false;
4832 }
4833
4834 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4835 {
4836         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4837         struct fib_dump_filter *filter = &arg->filter;
4838         unsigned int flags = NLM_F_MULTI;
4839         struct net *net = arg->net;
4840
4841         if (rt == net->ipv6.fib6_null_entry)
4842                 return 0;
4843
4844         if ((filter->flags & RTM_F_PREFIX) &&
4845             !(rt->fib6_flags & RTF_PREFIX_RT)) {
4846                 /* success since this is not a prefix route */
4847                 return 1;
4848         }
4849         if (filter->filter_set) {
4850                 if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
4851                     (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
4852                     (filter->protocol && rt->fib6_protocol != filter->protocol)) {
4853                         return 1;
4854                 }
4855                 flags |= NLM_F_DUMP_FILTERED;
4856         }
4857
4858         return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4859                              RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4860                              arg->cb->nlh->nlmsg_seq, flags);
4861 }
4862
/* Validate an RTM_GETROUTE request and parse its attributes into @tb.
 * Legacy (non-strict) sockets only get policy parsing; strict sockets
 * additionally require zeroed header fields, full /128 prefix lengths
 * when RTA_SRC/RTA_DST are present, and a whitelist of attributes.
 * Returns 0 on success or a negative errno with an extack message.
 */
static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
					const struct nlmsghdr *nlh,
					struct nlattr **tb,
					struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	int i, err;

	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
		NL_SET_ERR_MSG_MOD(extack,
				   "Invalid header for get route request");
		return -EINVAL;
	}

	/* legacy userspace: accept anything the policy allows */
	if (!netlink_strict_get_check(skb))
		return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX,
				   rtm_ipv6_policy, extack);

	rtm = nlmsg_data(nlh);
	if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
	    rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
	    rtm->rtm_type) {
		NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
		return -EINVAL;
	}
	/* RTM_F_FIB_MATCH is the only flag meaningful for a get request */
	if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
		NL_SET_ERR_MSG_MOD(extack,
				   "Invalid flags for get route request");
		return -EINVAL;
	}

	err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
				 rtm_ipv6_policy, extack);
	if (err)
		return err;

	/* an address attribute without the matching /128 prefix length
	 * in the header is ambiguous — reject it
	 */
	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
		NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
		return -EINVAL;
	}

	/* only the attributes a route lookup consumes are permitted */
	for (i = 0; i <= RTA_MAX; i++) {
		if (!tb[i])
			continue;

		switch (i) {
		case RTA_SRC:
		case RTA_DST:
		case RTA_IIF:
		case RTA_OIF:
		case RTA_MARK:
		case RTA_UID:
		case RTA_SPORT:
		case RTA_DPORT:
		case RTA_IP_PROTO:
			break;
		default:
			NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
			return -EINVAL;
		}
	}

	return 0;
}
4929
4930 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4931                               struct netlink_ext_ack *extack)
4932 {
4933         struct net *net = sock_net(in_skb->sk);
4934         struct nlattr *tb[RTA_MAX+1];
4935         int err, iif = 0, oif = 0;
4936         struct fib6_info *from;
4937         struct dst_entry *dst;
4938         struct rt6_info *rt;
4939         struct sk_buff *skb;
4940         struct rtmsg *rtm;
4941         struct flowi6 fl6 = {};
4942         bool fibmatch;
4943
4944         err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
4945         if (err < 0)
4946                 goto errout;
4947
4948         err = -EINVAL;
4949         rtm = nlmsg_data(nlh);
4950         fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4951         fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4952
4953         if (tb[RTA_SRC]) {
4954                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4955                         goto errout;
4956
4957                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4958         }
4959
4960         if (tb[RTA_DST]) {
4961                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4962                         goto errout;
4963
4964                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4965         }
4966
4967         if (tb[RTA_IIF])
4968                 iif = nla_get_u32(tb[RTA_IIF]);
4969
4970         if (tb[RTA_OIF])
4971                 oif = nla_get_u32(tb[RTA_OIF]);
4972
4973         if (tb[RTA_MARK])
4974                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4975
4976         if (tb[RTA_UID])
4977                 fl6.flowi6_uid = make_kuid(current_user_ns(),
4978                                            nla_get_u32(tb[RTA_UID]));
4979         else
4980                 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4981
4982         if (tb[RTA_SPORT])
4983                 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
4984
4985         if (tb[RTA_DPORT])
4986                 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
4987
4988         if (tb[RTA_IP_PROTO]) {
4989                 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
4990                                                   &fl6.flowi6_proto, AF_INET6,
4991                                                   extack);
4992                 if (err)
4993                         goto errout;
4994         }
4995
4996         if (iif) {
4997                 struct net_device *dev;
4998                 int flags = 0;
4999
5000                 rcu_read_lock();
5001
5002                 dev = dev_get_by_index_rcu(net, iif);
5003                 if (!dev) {
5004                         rcu_read_unlock();
5005                         err = -ENODEV;
5006                         goto errout;
5007                 }
5008
5009                 fl6.flowi6_iif = iif;
5010
5011                 if (!ipv6_addr_any(&fl6.saddr))
5012                         flags |= RT6_LOOKUP_F_HAS_SADDR;
5013
5014                 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
5015
5016                 rcu_read_unlock();
5017         } else {
5018                 fl6.flowi6_oif = oif;
5019
5020                 dst = ip6_route_output(net, NULL, &fl6);
5021         }
5022
5023
5024         rt = container_of(dst, struct rt6_info, dst);
5025         if (rt->dst.error) {
5026                 err = rt->dst.error;
5027                 ip6_rt_put(rt);
5028                 goto errout;
5029         }
5030
5031         if (rt == net->ipv6.ip6_null_entry) {
5032                 err = rt->dst.error;
5033                 ip6_rt_put(rt);
5034                 goto errout;
5035         }
5036
5037         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
5038         if (!skb) {
5039                 ip6_rt_put(rt);
5040                 err = -ENOBUFS;
5041                 goto errout;
5042         }
5043
5044         skb_dst_set(skb, &rt->dst);
5045
5046         rcu_read_lock();
5047         from = rcu_dereference(rt->from);
5048
5049         if (fibmatch)
5050                 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
5051                                     RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
5052                                     nlh->nlmsg_seq, 0);
5053         else
5054                 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
5055                                     &fl6.saddr, iif, RTM_NEWROUTE,
5056                                     NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
5057                                     0);
5058         rcu_read_unlock();
5059
5060         if (err < 0) {
5061                 kfree_skb(skb);
5062                 goto errout;
5063         }
5064
5065         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
5066 errout:
5067         return err;
5068 }
5069
/* Send an RTM_NEWROUTE/RTM_DELROUTE (@event) notification for @rt to
 * RTNLGRP_IPV6_ROUTE listeners.  Best effort: on allocation or fill
 * failure the error is pushed to interested sockets via
 * rtnl_set_sk_err() instead of being returned to the caller.
 */
void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
                     unsigned int nlm_flags)
{
        struct sk_buff *skb;
        struct net *net = info->nl_net;
        u32 seq;
        int err;

        err = -ENOBUFS;
        seq = info->nlh ? info->nlh->nlmsg_seq : 0;

        /* gfp_any(): allocation context differs per caller (process vs softirq) */
        skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
        if (!skb)
                goto errout;

        err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
                            event, info->portid, seq, nlm_flags);
        if (err < 0) {
                /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
                WARN_ON(err == -EMSGSIZE);
                kfree_skb(skb);
                goto errout;
        }
        rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
                    info->nlh, gfp_any());
        return;
errout:
        if (err < 0)
                rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}
5100
/* Netdevice notifier: on loopback NETDEV_REGISTER, wire the per-netns
 * special routes (null entry, plus prohibit/blackhole when
 * CONFIG_IPV6_MULTIPLE_TABLES) to the loopback device; on
 * NETDEV_UNREGISTER, drop the idev references taken here.
 */
static int ip6_route_dev_notify(struct notifier_block *this,
                                unsigned long event, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
        struct net *net = dev_net(dev);

        /* only the loopback device is of interest here */
        if (!(dev->flags & IFF_LOOPBACK))
                return NOTIFY_OK;

        if (event == NETDEV_REGISTER) {
                net->ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = dev;
                net->ipv6.ip6_null_entry->dst.dev = dev;
                /* in6_dev_get() takes a reference; released on UNREGISTER below */
                net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
                net->ipv6.ip6_prohibit_entry->dst.dev = dev;
                net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
                net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
                net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
         } else if (event == NETDEV_UNREGISTER &&
                    dev->reg_state != NETREG_UNREGISTERED) {
                /* NETDEV_UNREGISTER could be fired for multiple times by
                 * netdev_wait_allrefs(). Make sure we only call this once.
                 */
                in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
                in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
                in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
        }

        return NOTIFY_OK;
}
5134
5135 /*
5136  *      /proc
5137  */
5138
5139 #ifdef CONFIG_PROC_FS
/* /proc/net/rt6_stats show handler: one line of seven %04x fields —
 * fib nodes, route nodes, allocated rt entries, rt entries, cached
 * entries, current dst entry count (slow/exact read), discarded routes.
 */
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
        struct net *net = (struct net *)seq->private;
        seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
                   net->ipv6.rt6_stats->fib_nodes,
                   net->ipv6.rt6_stats->fib_route_nodes,
                   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
                   net->ipv6.rt6_stats->fib_rt_entries,
                   net->ipv6.rt6_stats->fib_rt_cache,
                   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
                   net->ipv6.rt6_stats->fib_discarded_routes);

        return 0;
}
5154 #endif  /* CONFIG_PROC_FS */
5155
5156 #ifdef CONFIG_SYSCTL
5157
/* Handler for the write-only net.ipv6.route.flush sysctl: a write
 * triggers an immediate fib6 garbage-collection pass.  Reads are
 * rejected with -EINVAL (the entry is mode 0200).
 */
static
int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
                              void __user *buffer, size_t *lenp, loff_t *ppos)
{
        struct net *net;
        int delay;
        int ret;
        if (!write)
                return -EINVAL;

        /* ipv6_route_sysctl_init() stores the owning netns in extra1 */
        net = (struct net *)ctl->extra1;
        /* NOTE(review): this captures flush_delay from *before*
         * proc_dointvec() stores the newly written value — presumably
         * intentional (the write itself is the trigger); confirm before
         * changing. */
        delay = net->ipv6.sysctl.flush_delay;
        ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
        if (ret)
                return ret;

        /* delay <= 0: flush now; delay > 0: expire entries older than delay */
        fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
        return 0;
}
5177
/* Bounds for the skip_notify_on_dev_down sysctl (boolean, 0..1) */
static int zero;
static int one = 1;
5180
5181 static struct ctl_table ipv6_route_table_template[] = {
5182         {
5183                 .procname       =       "flush",
5184                 .data           =       &init_net.ipv6.sysctl.flush_delay,
5185                 .maxlen         =       sizeof(int),
5186                 .mode           =       0200,
5187                 .proc_handler   =       ipv6_sysctl_rtcache_flush
5188         },
5189         {
5190                 .procname       =       "gc_thresh",
5191                 .data           =       &ip6_dst_ops_template.gc_thresh,
5192                 .maxlen         =       sizeof(int),
5193                 .mode           =       0644,
5194                 .proc_handler   =       proc_dointvec,
5195         },
5196         {
5197                 .procname       =       "max_size",
5198                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
5199                 .maxlen         =       sizeof(int),
5200                 .mode           =       0644,
5201                 .proc_handler   =       proc_dointvec,
5202         },
5203         {
5204                 .procname       =       "gc_min_interval",
5205                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5206                 .maxlen         =       sizeof(int),
5207                 .mode           =       0644,
5208                 .proc_handler   =       proc_dointvec_jiffies,
5209         },
5210         {
5211                 .procname       =       "gc_timeout",
5212                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
5213                 .maxlen         =       sizeof(int),
5214                 .mode           =       0644,
5215                 .proc_handler   =       proc_dointvec_jiffies,
5216         },
5217         {
5218                 .procname       =       "gc_interval",
5219                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
5220                 .maxlen         =       sizeof(int),
5221                 .mode           =       0644,
5222                 .proc_handler   =       proc_dointvec_jiffies,
5223         },
5224         {
5225                 .procname       =       "gc_elasticity",
5226                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
5227                 .maxlen         =       sizeof(int),
5228                 .mode           =       0644,
5229                 .proc_handler   =       proc_dointvec,
5230         },
5231         {
5232                 .procname       =       "mtu_expires",
5233                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
5234                 .maxlen         =       sizeof(int),
5235                 .mode           =       0644,
5236                 .proc_handler   =       proc_dointvec_jiffies,
5237         },
5238         {
5239                 .procname       =       "min_adv_mss",
5240                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
5241                 .maxlen         =       sizeof(int),
5242                 .mode           =       0644,
5243                 .proc_handler   =       proc_dointvec,
5244         },
5245         {
5246                 .procname       =       "gc_min_interval_ms",
5247                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5248                 .maxlen         =       sizeof(int),
5249                 .mode           =       0644,
5250                 .proc_handler   =       proc_dointvec_ms_jiffies,
5251         },
5252         {
5253                 .procname       =       "skip_notify_on_dev_down",
5254                 .data           =       &init_net.ipv6.sysctl.skip_notify_on_dev_down,
5255                 .maxlen         =       sizeof(int),
5256                 .mode           =       0644,
5257                 .proc_handler   =       proc_dointvec,
5258                 .extra1         =       &zero,
5259                 .extra2         =       &one,
5260         },
5261         { }
5262 };
5263
/* Clone the route sysctl template for @net and point each entry's .data
 * at that netns' fields.  Returns NULL on allocation failure; the
 * caller owns the returned table and must eventually kfree() it.
 */
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
        struct ctl_table *table;

        table = kmemdup(ipv6_route_table_template,
                        sizeof(ipv6_route_table_template),
                        GFP_KERNEL);

        if (table) {
                /* indices must stay in sync with ipv6_route_table_template */
                table[0].data = &net->ipv6.sysctl.flush_delay;
                table[0].extra1 = net; /* flush handler recovers the netns from extra1 */
                table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
                table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
                table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
                table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
                table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
                table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
                table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
                table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
                table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
                table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;

                /* Don't export sysctls to unprivileged users */
                if (net->user_ns != &init_user_ns)
                        table[0].procname = NULL;
        }

        return table;
}
5293 #endif
5294
/* Per-netns initialization of the IPv6 routing core: clone the dst_ops
 * template, allocate the special route entries (fib6/ip6 null, plus
 * prohibit and blackhole when CONFIG_IPV6_MULTIPLE_TABLES) and seed the
 * route sysctl defaults.  Returns 0 or -ENOMEM; partial allocations are
 * unwound through the goto ladder at the bottom.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
        int ret = -ENOMEM;

        memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
               sizeof(net->ipv6.ip6_dst_ops));

        if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
                goto out_ip6_dst_ops;

        net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
                                            sizeof(*net->ipv6.fib6_null_entry),
                                            GFP_KERNEL);
        if (!net->ipv6.fib6_null_entry)
                goto out_ip6_dst_entries;

        net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
                                           sizeof(*net->ipv6.ip6_null_entry),
                                           GFP_KERNEL);
        if (!net->ipv6.ip6_null_entry)
                goto out_fib6_null_entry;
        net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
        dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
                         ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
        net->ipv6.fib6_has_custom_rules = false;
        net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
                                               sizeof(*net->ipv6.ip6_prohibit_entry),
                                               GFP_KERNEL);
        if (!net->ipv6.ip6_prohibit_entry)
                goto out_ip6_null_entry;
        net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
        dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
                         ip6_template_metrics, true);

        net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
                                               sizeof(*net->ipv6.ip6_blk_hole_entry),
                                               GFP_KERNEL);
        if (!net->ipv6.ip6_blk_hole_entry)
                goto out_ip6_prohibit_entry;
        net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
        dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
                         ip6_template_metrics, true);
#endif

        /* GC / PMTU tunables; overridable later via net.ipv6.route.* sysctls */
        net->ipv6.sysctl.flush_delay = 0;
        net->ipv6.sysctl.ip6_rt_max_size = 4096;
        net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
        net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
        net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
        net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
        net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
        net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
        net->ipv6.sysctl.skip_notify_on_dev_down = 0;

        net->ipv6.ip6_rt_gc_expire = 30*HZ;

        ret = 0;
out:
        return ret;

        /* error unwind: free in reverse order of allocation */
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
        kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
        kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
        kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
        dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
        goto out;
}
5370
/* Per-netns teardown: free the special route entries allocated by
 * ip6_route_net_init() and release the dst entry counter.
 */
static void __net_exit ip6_route_net_exit(struct net *net)
{
        kfree(net->ipv6.fib6_null_entry);
        kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
        kfree(net->ipv6.ip6_prohibit_entry);
        kfree(net->ipv6.ip6_blk_hole_entry);
#endif
        dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
5381
/* Late per-netns init: create /proc/net/ipv6_route (route dump) and
 * /proc/net/rt6_stats (counters).  Creation failures are ignored.
 */
static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
        proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
                        sizeof(struct ipv6_route_iter));
        proc_create_net_single("rt6_stats", 0444, net->proc_net,
                        rt6_stats_seq_show, NULL);
#endif
        return 0;
}
5392
5393 static void __net_exit ip6_route_net_exit_late(struct net *net)
5394 {
5395 #ifdef CONFIG_PROC_FS
5396         remove_proc_entry("ipv6_route", net->proc_net);
5397         remove_proc_entry("rt6_stats", net->proc_net);
5398 #endif
5399 }
5400
/* Core per-netns routing state; the /proc entries are handled
 * separately by ip6_route_net_late_ops.
 */
static struct pernet_operations ip6_route_net_ops = {
        .init = ip6_route_net_init,
        .exit = ip6_route_net_exit,
};
5405
5406 static int __net_init ipv6_inetpeer_init(struct net *net)
5407 {
5408         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5409
5410         if (!bp)
5411                 return -ENOMEM;
5412         inet_peer_base_init(bp);
5413         net->ipv6.peers = bp;
5414         return 0;
5415 }
5416
5417 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5418 {
5419         struct inet_peer_base *bp = net->ipv6.peers;
5420
5421         net->ipv6.peers = NULL;
5422         inetpeer_invalidate_tree(bp);
5423         kfree(bp);
5424 }
5425
/* Per-netns inet_peer storage used by IPv6 */
static struct pernet_operations ipv6_inetpeer_ops = {
        .init   =       ipv6_inetpeer_init,
        .exit   =       ipv6_inetpeer_exit,
};
5430
/* Registered after the core ops in ip6_route_init(), so the /proc
 * entries only appear once the per-netns routing state exists.
 */
static struct pernet_operations ip6_route_net_late_ops = {
        .init = ip6_route_net_init_late,
        .exit = ip6_route_net_exit_late,
};
5435
/* Device notifier for wiring the special routes to loopback; priority
 * is set just below addrconf's so addrconf handles the event first
 * (notifiers run in descending priority order).
 */
static struct notifier_block ip6_route_dev_notifier = {
        .notifier_call = ip6_route_dev_notify,
        .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
5440
/* Boot-time fixup for init_net: the loopback device registers before
 * our notifier exists, so ip6_route_dev_notify() never saw its
 * NETDEV_REGISTER — take the device/idev references manually here.
 */
void __init ip6_route_init_special_entries(void)
{
        /* Registering of the loopback is done before this portion of code,
         * the loopback reference in rt6_info will not be taken, do it
         * manually for init_net */
        init_net.ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = init_net.loopback_dev;
        init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
        init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
        init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
        init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
        init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
        init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #endif
}
5456
/* Boot-time initialization of the IPv6 routing layer: dst slab cache,
 * pernet subsystems (inetpeer, core route state, late /proc entries),
 * fib6 core, xfrm6, policy rules, the RTM_{NEW,DEL,GET}ROUTE netlink
 * handlers, the device notifier and the per-cpu uncached-route lists.
 * Any failure unwinds everything registered so far via the goto ladder
 * and returns the error.
 */
int __init ip6_route_init(void)
{
        int ret;
        int cpu;

        ret = -ENOMEM;
        ip6_dst_ops_template.kmem_cachep =
                kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
                                  SLAB_HWCACHE_ALIGN, NULL);
        if (!ip6_dst_ops_template.kmem_cachep)
                goto out;

        ret = dst_entries_init(&ip6_dst_blackhole_ops);
        if (ret)
                goto out_kmem_cache;

        ret = register_pernet_subsys(&ipv6_inetpeer_ops);
        if (ret)
                goto out_dst_entries;

        ret = register_pernet_subsys(&ip6_route_net_ops);
        if (ret)
                goto out_register_inetpeer;

        /* blackhole dsts are carved from the same slab as regular rt6_infos */
        ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

        ret = fib6_init();
        if (ret)
                goto out_register_subsys;

        ret = xfrm6_init();
        if (ret)
                goto out_fib6_init;

        ret = fib6_rules_init();
        if (ret)
                goto xfrm6_init;

        ret = register_pernet_subsys(&ip6_route_net_late_ops);
        if (ret)
                goto fib6_rules_init;

        /* any rtnl registration failure below tears down all of PF_INET6
         * via rtnl_unregister_all() in the unwind path */
        ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
                                   inet6_rtm_newroute, NULL, 0);
        if (ret < 0)
                goto out_register_late_subsys;

        ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
                                   inet6_rtm_delroute, NULL, 0);
        if (ret < 0)
                goto out_register_late_subsys;

        ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
                                   inet6_rtm_getroute, NULL,
                                   RTNL_FLAG_DOIT_UNLOCKED);
        if (ret < 0)
                goto out_register_late_subsys;

        ret = register_netdevice_notifier(&ip6_route_dev_notifier);
        if (ret)
                goto out_register_late_subsys;

        /* per-cpu lists of uncached (DST_NOCACHE) routes */
        for_each_possible_cpu(cpu) {
                struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

                INIT_LIST_HEAD(&ul->head);
                spin_lock_init(&ul->lock);
        }

out:
        return ret;

        /* error unwind, reverse order of the steps above */
out_register_late_subsys:
        rtnl_unregister_all(PF_INET6);
        unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
        fib6_rules_cleanup();
xfrm6_init:
        xfrm6_fini();
out_fib6_init:
        fib6_gc_cleanup();
out_register_subsys:
        unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
        unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
        dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
        kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
        goto out;
}
5548
/* Teardown counterpart of ip6_route_init(): unregister everything in
 * reverse order of registration.
 */
void ip6_route_cleanup(void)
{
        unregister_netdevice_notifier(&ip6_route_dev_notifier);
        unregister_pernet_subsys(&ip6_route_net_late_ops);
        fib6_rules_cleanup();
        xfrm6_fini();
        fib6_gc_cleanup();
        unregister_pernet_subsys(&ipv6_inetpeer_ops);
        unregister_pernet_subsys(&ip6_route_net_ops);
        dst_entries_destroy(&ip6_dst_blackhole_ops);
        kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}