/*
 * net/ipv6/route.c (from linux.git; includes "ipv6: Fix redirect with VRF")
 */
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/rtnh.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <net/ip.h>
67 #include <linux/uaccess.h>
68
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72
73 static int ip6_rt_type_to_error(u8 fib6_type);
74
75 #define CREATE_TRACE_POINTS
76 #include <trace/events/fib6.h>
77 EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
78 #undef CREATE_TRACE_POINTS
79
/* Neighbour-reachability verdicts used when scoring candidate routes
 * (see rt6_check_neigh()/rt6_score_route()); negative values mean the
 * route cannot be used as-is.
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,		/* never usable for this lookup */
	RT6_NUD_FAIL_PROBE = -2,	/* neighbour exists but is NUD_FAILED */
	RT6_NUD_FAIL_DO_RR = -1,	/* no neighbour entry; round-robin instead */
	RT6_NUD_SUCCEED = 1
};
86
87 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
88 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
89 static unsigned int      ip6_mtu(const struct dst_entry *dst);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void             ip6_dst_destroy(struct dst_entry *);
92 static void             ip6_dst_ifdown(struct dst_entry *,
93                                        struct net_device *dev, int how);
94 static int               ip6_dst_gc(struct dst_ops *ops);
95
96 static int              ip6_pkt_discard(struct sk_buff *skb);
97 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
98 static int              ip6_pkt_prohibit(struct sk_buff *skb);
99 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
100 static void             ip6_link_failure(struct sk_buff *skb);
101 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
102                                            struct sk_buff *skb, u32 mtu);
103 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
104                                         struct sk_buff *skb);
105 static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
106                            int strict);
107 static size_t rt6_nlmsg_size(struct fib6_info *rt);
108 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
109                          struct fib6_info *rt, struct dst_entry *dst,
110                          struct in6_addr *dest, struct in6_addr *src,
111                          int iif, int type, u32 portid, u32 seq,
112                          unsigned int flags);
113 static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
114                                            const struct in6_addr *daddr,
115                                            const struct in6_addr *saddr);
116
117 #ifdef CONFIG_IPV6_ROUTE_INFO
118 static struct fib6_info *rt6_add_route_info(struct net *net,
119                                            const struct in6_addr *prefix, int prefixlen,
120                                            const struct in6_addr *gwaddr,
121                                            struct net_device *dev,
122                                            unsigned int pref);
123 static struct fib6_info *rt6_get_route_info(struct net *net,
124                                            const struct in6_addr *prefix, int prefixlen,
125                                            const struct in6_addr *gwaddr,
126                                            struct net_device *dev);
127 #endif
128
/* Per-cpu list of uncached rt6_info entries so they can be found and
 * re-parented when their device goes away (rt6_uncached_list_flush_dev).
 */
struct uncached_list {
	spinlock_t		lock;	/* protects head */
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
135
136 void rt6_uncached_list_add(struct rt6_info *rt)
137 {
138         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
139
140         rt->rt6i_uncached_list = ul;
141
142         spin_lock_bh(&ul->lock);
143         list_add_tail(&rt->rt6i_uncached, &ul->head);
144         spin_unlock_bh(&ul->lock);
145 }
146
147 void rt6_uncached_list_del(struct rt6_info *rt)
148 {
149         if (!list_empty(&rt->rt6i_uncached)) {
150                 struct uncached_list *ul = rt->rt6i_uncached_list;
151                 struct net *net = dev_net(rt->dst.dev);
152
153                 spin_lock_bh(&ul->lock);
154                 list_del(&rt->rt6i_uncached);
155                 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
156                 spin_unlock_bh(&ul->lock);
157         }
158 }
159
/* Re-parent every uncached route that references @dev onto the netns
 * loopback device, so @dev can be unregistered.  Walks each CPU's
 * uncached list under its lock.  No-op when @dev is loopback itself
 * (there is nowhere else to migrate to).
 */
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			/* Swap the inet6_dev reference over to loopback. */
			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			/* Take the new device hold before dropping the
			 * old one.
			 */
			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
191
192 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
193                                              struct sk_buff *skb,
194                                              const void *daddr)
195 {
196         if (!ipv6_addr_any(p))
197                 return (const void *) p;
198         else if (skb)
199                 return &ipv6_hdr(skb)->daddr;
200         return daddr;
201 }
202
203 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
204                                    struct net_device *dev,
205                                    struct sk_buff *skb,
206                                    const void *daddr)
207 {
208         struct neighbour *n;
209
210         daddr = choose_neigh_daddr(gw, skb, daddr);
211         n = __ipv6_neigh_lookup(dev, daddr);
212         if (n)
213                 return n;
214
215         n = neigh_create(&nd_tbl, daddr, dev);
216         return IS_ERR(n) ? NULL : n;
217 }
218
219 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
220                                               struct sk_buff *skb,
221                                               const void *daddr)
222 {
223         const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
224
225         return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
226 }
227
/* dst_ops.confirm_neigh hook: mark the next-hop neighbour entry as
 * recently confirmed.  Skipped for devices that do no neighbour
 * resolution and for multicast destinations.
 */
static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}
242
/* dst_ops used for ordinary IPv6 routes; copied per-netns into
 * net->ipv6.ip6_dst_ops.
 */
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	dst_cow_metrics_generic,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};
261
262 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
263 {
264         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
265
266         return mtu ? : dst->dev->mtu;
267 }
268
/* Blackhole routes deliberately ignore PMTU updates. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}
273
/* Blackhole routes deliberately ignore redirects. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}
278
/* dst_ops for blackhole dst entries: PMTU updates and redirects are
 * no-ops, and there is no gc/ifdown/link_failure handling.
 */
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
};
290
/* Metrics template for the special route entries below; only the hop
 * limit slot is explicitly initialized (to 0).
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};
294
/* FIB entry used when a table lookup matches nothing: a reject route
 * with no next hop and the worst possible metric.
 */
static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol  = RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= REFCOUNT_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};
303
/* dst returned for failed lookups: discards packets and reports
 * -ENETUNREACH.
 */
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
315
316 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
317
/* "Prohibit" policy-routing result: discards packets with -EACCES. */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
329
/* "Blackhole" policy-routing result: silently discards with -EINVAL. */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
341
342 #endif
343
/* Zero every rt6_info field past the embedded dst_entry and reset the
 * uncached-list linkage.  The pointer arithmetic relies on dst being
 * the first member of struct rt6_info.
 */
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	/* dst + 1 is the first rt6_info-private field. */
	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}
351
352 /* allocate dst with ip6_dst_ops */
353 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
354                                int flags)
355 {
356         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
357                                         1, DST_OBSOLETE_FORCE_CHK, flags);
358
359         if (rt) {
360                 rt6_info_init(rt);
361                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
362         }
363
364         return rt;
365 }
366 EXPORT_SYMBOL(ip6_dst_alloc);
367
/* dst_ops.destroy hook: release everything the rt6_info holds —
 * metrics, uncached-list membership, the inet6_dev reference and the
 * originating fib6_info.
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	ip_dst_metrics_put(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	/* Atomically detach rt->from so concurrent readers see either
	 * the old fib6_info or NULL, never a freed pointer.
	 */
	from = xchg((__force struct fib6_info **)&rt->from, NULL);
	fib6_info_release(from);
}
386
/* dst_ops.ifdown hook: when @dev is going away, re-point the route's
 * inet6_dev reference at the netns loopback device so the original
 * device's inet6_dev can be released.
 */
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		/* Only swap if loopback's inet6_dev could be grabbed. */
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}
403
404 static bool __rt6_check_expired(const struct rt6_info *rt)
405 {
406         if (rt->rt6i_flags & RTF_EXPIRES)
407                 return time_after(jiffies, rt->dst.expires);
408         else
409                 return false;
410 }
411
/* True if the route has expired, or — for a route cloned from a
 * fib6_info — if the dst was invalidated or the parent entry expired.
 * The rcu_dereference() of rt->from requires rcu_read_lock() held by
 * the caller.
 */
static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		/* DST_OBSOLETE_FORCE_CHK is the value set at allocation;
		 * any other obsolete value means the dst is stale.
		 */
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}
427
/* Multipath (ECMP) selection: when res->f6i has siblings, hash the flow
 * and walk the sibling list for the entry whose upper bound covers the
 * hash, storing the chosen fib6_info and its nexthop back into @res.
 * The current match is kept when an oif match was already enforced,
 * when the flow hashes into its own bucket, or when the selected
 * sibling scores negatively.
 */
void fib6_select_path(const struct net *net, struct fib6_result *res,
		      struct flowi6 *fl6, int oif, bool have_oif_match,
		      const struct sk_buff *skb, int strict)
{
	struct fib6_info *sibling, *next_sibling;
	struct fib6_info *match = res->f6i;

	if (!match->fib6_nsiblings || have_oif_match)
		goto out;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	/* The matched entry itself covers hashes up to its upper bound. */
	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.fib_nh_upper_bound))
		goto out;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		const struct fib6_nh *nh = &sibling->fib6_nh;
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		/* Unusable sibling: stop and keep the original match. */
		if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

out:
	res->f6i = match;
	res->nh = &match->fib6_nh;
}
465
466 /*
467  *      Route lookup. rcu_read_lock() should be held.
468  */
469
470 static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh,
471                                const struct in6_addr *saddr, int oif, int flags)
472 {
473         const struct net_device *dev;
474
475         if (nh->fib_nh_flags & RTNH_F_DEAD)
476                 return false;
477
478         dev = nh->fib_nh_dev;
479         if (oif) {
480                 if (dev->ifindex == oif)
481                         return true;
482         } else {
483                 if (ipv6_chk_addr(net, saddr, dev,
484                                   flags & RT6_LOOKUP_F_IFACE))
485                         return true;
486         }
487
488         return false;
489 }
490
/* Narrow the lookup result in @res to a fib6_info whose nexthop matches
 * the requested output interface and/or source address.  Falls back to
 * the netns null entry when a strict interface match was requested but
 * none found, or when the original nexthop is dead.  Always fills
 * res->nh/fib6_type/fib6_flags on exit.
 */
static void rt6_device_match(struct net *net, struct fib6_result *res,
			     const struct in6_addr *saddr, int oif, int flags)
{
	struct fib6_info *f6i = res->f6i;
	struct fib6_info *spf6i;
	struct fib6_nh *nh;

	/* No constraints: any live nexthop will do. */
	if (!oif && ipv6_addr_any(saddr)) {
		nh = &f6i->fib6_nh;
		if (!(nh->fib_nh_flags & RTNH_F_DEAD))
			goto out;
	}

	for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) {
		nh = &spf6i->fib6_nh;
		if (__rt6_device_match(net, nh, saddr, oif, flags)) {
			res->f6i = spf6i;
			goto out;
		}
	}

	/* Strict interface lookup found nothing: reject. */
	if (oif && flags & RT6_LOOKUP_F_IFACE) {
		res->f6i = net->ipv6.fib6_null_entry;
		nh = &res->f6i->fib6_nh;
		goto out;
	}

	nh = &f6i->fib6_nh;
	if (nh->fib_nh_flags & RTNH_F_DEAD) {
		res->f6i = net->ipv6.fib6_null_entry;
		nh = &res->f6i->fib6_nh;
	}
out:
	res->nh = nh;
	res->fib6_type = res->f6i->fib6_type;
	res->fib6_flags = res->f6i->fib6_flags;
}
528
529 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred-work context for sending a router reachability probe. */
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;		/* router address to solicit */
	struct net_device *dev;		/* held; released by the worker */
};
535
/* Workqueue handler: send a neighbour solicitation to the probe target
 * via its solicited-node multicast address, then drop the device hold
 * taken by rt6_probe() and free the work item.
 */
static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}
547
548 static void rt6_probe(struct fib6_nh *fib6_nh)
549 {
550         struct __rt6_probe_work *work = NULL;
551         const struct in6_addr *nh_gw;
552         struct neighbour *neigh;
553         struct net_device *dev;
554         struct inet6_dev *idev;
555
556         /*
557          * Okay, this does not seem to be appropriate
558          * for now, however, we need to check if it
559          * is really so; aka Router Reachability Probing.
560          *
561          * Router Reachability Probe MUST be rate-limited
562          * to no more than one per minute.
563          */
564         if (fib6_nh->fib_nh_gw_family)
565                 return;
566
567         nh_gw = &fib6_nh->fib_nh_gw6;
568         dev = fib6_nh->fib_nh_dev;
569         rcu_read_lock_bh();
570         idev = __in6_dev_get(dev);
571         neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
572         if (neigh) {
573                 if (neigh->nud_state & NUD_VALID)
574                         goto out;
575
576                 write_lock(&neigh->lock);
577                 if (!(neigh->nud_state & NUD_VALID) &&
578                     time_after(jiffies,
579                                neigh->updated + idev->cnf.rtr_probe_interval)) {
580                         work = kmalloc(sizeof(*work), GFP_ATOMIC);
581                         if (work)
582                                 __neigh_set_probe_once(neigh);
583                 }
584                 write_unlock(&neigh->lock);
585         } else if (time_after(jiffies, fib6_nh->last_probe +
586                                        idev->cnf.rtr_probe_interval)) {
587                 work = kmalloc(sizeof(*work), GFP_ATOMIC);
588         }
589
590         if (work) {
591                 fib6_nh->last_probe = jiffies;
592                 INIT_WORK(&work->work, rt6_probe_deferred);
593                 work->target = *nh_gw;
594                 dev_hold(dev);
595                 work->dev = dev;
596                 schedule_work(&work->work);
597         }
598
599 out:
600         rcu_read_unlock_bh();
601 }
602 #else
/* Router probing is only built with CONFIG_IPV6_ROUTER_PREF. */
static inline void rt6_probe(struct fib6_nh *fib6_nh)
{
}
606 #endif
607
/*
 * Default Router Selection (RFC 2461 6.3.6)
 *
 * Classify the reachability of the nexthop's gateway neighbour:
 *   RT6_NUD_SUCCEED    - neighbour valid; with CONFIG_IPV6_ROUTER_PREF
 *                        also when the entry exists and is not FAILED,
 *                        or when no entry exists at all
 *   RT6_NUD_FAIL_PROBE - entry exists but is NUD_FAILED (router-pref only)
 *   RT6_NUD_FAIL_DO_RR - no entry, no router-pref: fall back to
 *                        round-robin selection
 *   RT6_NUD_FAIL_HARD  - entry exists but unproven, without router-pref
 */
static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev,
					  &fib6_nh->fib_nh_gw6);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
638
/* Score a candidate nexthop: +2 for matching the requested output
 * interface (or no oif given), plus the decoded router preference
 * shifted above it when CONFIG_IPV6_ROUTER_PREF is set.  Returns a
 * negative RT6_NUD_* value when the route must not be used.
 */
static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
			   int strict)
{
	int m = 0;

	if (!oif || nh->fib_nh_dev->ifindex == oif)
		m = 2;

	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2;
#endif
	/* Reachability only matters for gateway routes, and only when
	 * the caller requested it; RTF_NONEXTHOP routes are exempt.
	 */
	if ((strict & RT6_LOOKUP_F_REACHABLE) &&
	    !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) {
		int n = rt6_check_neigh(nh);
		if (n < 0)
			return n;
	}
	return m;
}
660
/* Score nexthop @nh against the best score seen so far (*mpri).
 * Returns true when @nh becomes the new best candidate; *do_rr is set
 * when the caller should advance the round-robin pointer.
 */
static bool find_match(struct fib6_nh *nh, u32 fib6_flags,
		       int oif, int strict, int *mpri, bool *do_rr)
{
	bool match_do_rr = false;
	bool rc = false;
	int m;

	if (nh->fib_nh_flags & RTNH_F_DEAD)
		goto out;

	/* Skip link-down nexthops when the device is configured to
	 * ignore such routes, unless the lookup tolerates linkdown.
	 */
	if (ip6_ignore_linkdown(nh->fib_nh_dev) &&
	    nh->fib_nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	m = rt6_score_route(nh, fib6_flags, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(nh);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		rc = true;
	}
out:
	return rc;
}
696
/* Walk fib6_info entries from @f6i_start (stopping before @nomatch),
 * feeding each unexpired nexthop to find_match() and recording the
 * best candidate in @res.  When @cont is non-NULL, the walk stops at
 * the first entry whose metric differs from @metric, reporting it via
 * *cont so the caller can resume a second, unrestricted pass.
 */
static void __find_rr_leaf(struct fib6_info *f6i_start,
			   struct fib6_info *nomatch, u32 metric,
			   struct fib6_result *res, struct fib6_info **cont,
			   int oif, int strict, bool *do_rr, int *mpri)
{
	struct fib6_info *f6i;

	for (f6i = f6i_start;
	     f6i && f6i != nomatch;
	     f6i = rcu_dereference(f6i->fib6_next)) {
		struct fib6_nh *nh;

		/* Metric boundary: remember where to resume and stop. */
		if (cont && f6i->fib6_metric != metric) {
			*cont = f6i;
			return;
		}

		if (fib6_check_expired(f6i))
			continue;

		nh = &f6i->fib6_nh;
		if (find_match(nh, f6i->fib6_flags, oif, strict, mpri, do_rr)) {
			res->f6i = f6i;
			res->nh = nh;
			res->fib6_flags = f6i->fib6_flags;
			res->fib6_type = f6i->fib6_type;
		}
	}
}
726
/* Round-robin leaf search: scan from @rr_head to the end of the list,
 * then from @leaf back up to @rr_head, restricted to entries that share
 * rr_head's metric.  If nothing matched and a different-metric entry
 * was seen (@cont), retry from there without the metric restriction.
 * (@fn is currently unused here.)
 */
static void find_rr_leaf(struct fib6_node *fn, struct fib6_info *leaf,
			 struct fib6_info *rr_head, int oif, int strict,
			 bool *do_rr, struct fib6_result *res)
{
	u32 metric = rr_head->fib6_metric;
	struct fib6_info *cont = NULL;
	int mpri = -1;

	__find_rr_leaf(rr_head, NULL, metric, res, &cont,
		       oif, strict, do_rr, &mpri);

	__find_rr_leaf(leaf, rr_head, metric, res, &cont,
		       oif, strict, do_rr, &mpri);

	if (res->f6i || !cont)
		return;

	__find_rr_leaf(cont, NULL, metric, res, NULL,
		       oif, strict, do_rr, &mpri);
}
747
/* Select the best route under fib6 node @fn for this lookup, honouring
 * the node's round-robin pointer (fn->rr_ptr) for equal-metric routes.
 * Always fills @res, falling back to the netns null entry when nothing
 * matched.  Runs under rcu_read_lock(); takes the table lock only to
 * rotate the round-robin pointer.
 */
static void rt6_select(struct net *net, struct fib6_node *fn, int oif,
		       struct fib6_result *res, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *rt0;
	bool do_rr = false;
	int key_plen;

	/* make sure this function or its helpers sets f6i */
	res->f6i = NULL;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		goto out;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not points to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		goto out;

	find_rr_leaf(fn, leaf, rt0, oif, strict, &do_rr, res);
	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

out:
	if (!res->f6i) {
		res->f6i = net->ipv6.fib6_null_entry;
		res->nh = &res->f6i->fib6_nh;
		res->fib6_flags = res->f6i->fib6_flags;
		res->fib6_type = res->f6i->fib6_type;
	}
}
804
805 static bool rt6_is_gw_or_nonexthop(const struct fib6_result *res)
806 {
807         return (res->f6i->fib6_flags & RTF_NONEXTHOP) ||
808                res->nh->fib_nh_gw_family;
809 }
810
811 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Process a Route Information Option (RFC 4191) received in a Router
 * Advertisement from gateway @gwaddr on @dev: validate the option, then
 * add, refresh or withdraw the corresponding RTF_ROUTEINFO route.
 * Returns 0 on success or -EINVAL for a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		/* prefixes longer than 64 bits need the full 16-byte body */
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	/* A zero-length prefix means the default route via this router. */
	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	/* Zero lifetime withdraws an existing route. */
	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		/* Drop the reference taken by the get/add helpers. */
		fib6_info_release(rt);
	}
	return 0;
}
885 #endif
886
887 /*
888  *      Misc support functions
889  */
890
891 /* called with rcu_lock held */
892 static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res)
893 {
894         struct net_device *dev = res->nh->fib_nh_dev;
895
896         if (res->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
897                 /* for copies of local routes, dst->dev needs to be the
898                  * device if it is a master device, the master device if
899                  * device is enslaved, and the loopback as the default
900                  */
901                 if (netif_is_l3_slave(dev) &&
902                     !rt6_need_strict(&res->f6i->fib6_dst.addr))
903                         dev = l3mdev_master_dev_rcu(dev);
904                 else if (!netif_is_l3_master(dev))
905                         dev = dev_net(dev)->loopback_dev;
906                 /* last case is netif_is_l3_master(dev) is true in which
907                  * case we want dev returned to be dev
908                  */
909         }
910
911         return dev;
912 }
913
/* Map each RTN_* route type to the dst error it reports; 0 means the
 * type delivers/forwards normally rather than rejecting traffic.
 */
static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};
928
/* Return the dst.error value for a fib6 route type (0 for types that
 * are not administrative rejects); see the fib6_prop table above.
 */
static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}
933
934 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
935 {
936         unsigned short flags = 0;
937
938         if (rt->dst_nocount)
939                 flags |= DST_NOCOUNT;
940         if (rt->dst_nopolicy)
941                 flags |= DST_NOPOLICY;
942         if (rt->dst_host)
943                 flags |= DST_HOST;
944
945         return flags;
946 }
947
/* Set up dst error and input/output handlers for an administratively
 * rejecting route (RTF_REJECT).  The fib6 type selects both the error
 * reported to callers and the handler used to drop traffic.
 */
static void ip6_rt_init_dst_reject(struct rt6_info *rt, u8 fib6_type)
{
	rt->dst.error = ip6_rt_type_to_error(fib6_type);

	switch (fib6_type) {
	case RTN_BLACKHOLE:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		/* unknown types are treated like unreachable */
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}
969
/* Initialize the dst fields of @rt (error, input/output handlers and
 * lightweight-tunnel state) from the fib6 lookup result @res.
 */
static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res)
{
	struct fib6_info *f6i = res->f6i;

	if (res->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, res->fib6_type);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	/* pick the input handler from the destination type */
	if (res->fib6_type == RTN_LOCAL || res->fib6_type == RTN_ANYCAST) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&f6i->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (res->nh->fib_nh_lws) {
		/* attach the nexthop's lightweight-tunnel state */
		rt->dst.lwtstate = lwtstate_get(res->nh->fib_nh_lws);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}
997
/* Caller must already hold reference to @from.
 * Link the (cached/cloned) route @rt to its originating fib entry and
 * inherit that entry's metrics.
 */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	rcu_assign_pointer(rt->from, from);
	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
}
1005
/* Caller must already hold reference to f6i in result.
 * Copy dst handlers, destination/source prefixes, flags and gateway
 * from the fib6 lookup result @res into the rt6_info @rt.
 */
static void ip6_rt_copy_init(struct rt6_info *rt, const struct fib6_result *res)
{
	const struct fib6_nh *nh = res->nh;
	const struct net_device *dev = nh->fib_nh_dev;
	struct fib6_info *f6i = res->f6i;

	ip6_rt_init_dst(rt, res);

	rt->rt6i_dst = f6i->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_flags = res->fib6_flags;
	if (nh->fib_nh_gw_family) {
		rt->rt6i_gateway = nh->fib_nh_gw6;
		rt->rt6i_flags |= RTF_GATEWAY;
	}
	rt6_set_from(rt, f6i);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = f6i->fib6_src;
#endif
}
1027
/* Walk back up the fib trie from @fn looking for the next node that
 * carries route info (RTN_RTINFO).  When a parent has a source-address
 * subtree, descend into it using @saddr first.  Returns NULL once the
 * tree root is reached without a match.
 */
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}
1045
1046 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
1047 {
1048         struct rt6_info *rt = *prt;
1049
1050         if (dst_hold_safe(&rt->dst))
1051                 return true;
1052         if (net) {
1053                 rt = net->ipv6.ip6_null_entry;
1054                 dst_hold(&rt->dst);
1055         } else {
1056                 rt = NULL;
1057         }
1058         *prt = rt;
1059         return false;
1060 }
1061
/* called with rcu_lock held */
static struct rt6_info *ip6_create_rt_rcu(const struct fib6_result *res)
{
	struct net_device *dev = res->nh->fib_nh_dev;
	struct fib6_info *f6i = res->f6i;
	unsigned short flags;
	struct rt6_info *nrt;

	/* the fib entry may be going away; only proceed if a reference
	 * can still be taken on it
	 */
	if (!fib6_info_hold_safe(f6i))
		goto fallback;

	flags = fib6_info_dst_flags(f6i);
	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (!nrt) {
		fib6_info_release(f6i);
		goto fallback;
	}

	ip6_rt_copy_init(nrt, res);
	return nrt;

fallback:
	/* on failure hand back the per-net null entry with a reference */
	nrt = dev_net(dev)->ipv6.ip6_null_entry;
	dst_hold(&nrt->dst);
	return nrt;
}
1088
/* Policy-routing lookup within a single table: find the best fib6 node
 * for the flow, select a path/nexthop, and return either a matching
 * cached (exception) route or a freshly created clone.  Falls back to
 * the per-net null entry when nothing matches.  The returned rt always
 * carries a dst reference.
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_result res = {};
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	res.f6i = rcu_dereference(fn->leaf);
	if (!res.f6i)
		res.f6i = net->ipv6.fib6_null_entry;
	else
		rt6_device_match(net, &res, &fl6->saddr, fl6->flowi6_oif,
				 flags);

	if (res.f6i == net->ipv6.fib6_null_entry) {
		/* no usable route at this node: back up the trie and retry */
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;

		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
		goto out;
	}

	fib6_select_path(net, &res, fl6, fl6->flowi6_oif,
			 fl6->flowi6_oif != 0, skb, flags);

	/* Search through exception table */
	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);
	} else {
		rt = ip6_create_rt_rcu(&res);
	}

out:
	trace_fib6_table_lookup(net, &res, table, fl6);

	rcu_read_unlock();

	return rt;
}
1141
/* Dispatch a route lookup through the fib rules framework using
 * ip6_pol_route_lookup() as the per-table lookup function.
 */
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
1148
1149 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1150                             const struct in6_addr *saddr, int oif,
1151                             const struct sk_buff *skb, int strict)
1152 {
1153         struct flowi6 fl6 = {
1154                 .flowi6_oif = oif,
1155                 .daddr = *daddr,
1156         };
1157         struct dst_entry *dst;
1158         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1159
1160         if (saddr) {
1161                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1162                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1163         }
1164
1165         dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1166         if (dst->error == 0)
1167                 return (struct rt6_info *) dst;
1168
1169         dst_release(dst);
1170
1171         return NULL;
1172 }
1173 EXPORT_SYMBOL(rt6_lookup);
1174
1175 /* ip6_ins_rt is called with FREE table->tb6_lock.
1176  * It takes new route entry, the addition fails by any reason the
1177  * route is released.
1178  * Caller must hold dst before calling it.
1179  */
1180
1181 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1182                         struct netlink_ext_ack *extack)
1183 {
1184         int err;
1185         struct fib6_table *table;
1186
1187         table = rt->fib6_table;
1188         spin_lock_bh(&table->tb6_lock);
1189         err = fib6_add(&table->tb6_root, rt, info, extack);
1190         spin_unlock_bh(&table->tb6_lock);
1191
1192         return err;
1193 }
1194
1195 int ip6_ins_rt(struct net *net, struct fib6_info *rt)
1196 {
1197         struct nl_info info = { .nl_net = net, };
1198
1199         return __ip6_ins_rt(rt, &info, NULL);
1200 }
1201
/* Create a cached clone (RTF_CACHE) of the route described by @res for
 * the specific destination @daddr (and, with subtrees, @saddr).
 * Returns NULL if the fib entry is going away or allocation fails.
 */
static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct fib6_info *f6i = res->f6i;
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (!fib6_info_hold_safe(f6i))
		return NULL;

	dev = ip6_rt_get_dev_rcu(res);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt) {
		fib6_info_release(f6i);
		return NULL;
	}

	/* the clone is a /128 host route to @daddr */
	ip6_rt_copy_init(rt, res);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(res)) {
		if (f6i->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&f6i->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		/* narrow the source prefix to the flow's source address */
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
1244
/* Allocate a per-cpu copy (RTF_PCPU) of the route described by @res.
 * Returns NULL if the fib entry is going away or allocation fails.
 */
static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res)
{
	struct fib6_info *f6i = res->f6i;
	unsigned short flags = fib6_info_dst_flags(f6i);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	if (!fib6_info_hold_safe(f6i))
		return NULL;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(res);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt) {
		fib6_info_release(f6i);
		return NULL;
	}
	ip6_rt_copy_init(pcpu_rt, res);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}
1267
/* It should be called with rcu_read_lock() acquired.
 * Return this cpu's cached copy of the route in @res with a reference
 * taken, or NULL if none exists (or it could not be held).
 */
static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(res->f6i->rt6i_pcpu);
	pcpu_rt = *p;

	/* ip6_hold_safe() with net == NULL nulls pcpu_rt on failure */
	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt);

	return pcpu_rt;
}
1281
/* Create and publish this cpu's cached copy of the route in @res.  On
 * allocation failure the per-net null entry is returned (with a
 * reference) instead.
 */
static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    const struct fib6_result *res)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(res);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(res->f6i->rt6i_pcpu);
	/* the slot must have been empty: only this path populates it */
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	if (res->f6i->fib6_destroying) {
		/* the fib entry started dying under us: detach from it
		 * so the pcpu copy does not pin a destroyed fib6_info
		 */
		struct fib6_info *from;

		from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL);
		fib6_info_release(from);
	}

	return pcpu_rt;
}
1307
/* exception hash table implementation
 *
 * Global lock serializing all writers of every fib6_info's
 * rt6i_exception_bucket table (insert/remove/flush); lookups on the
 * fast path use RCU instead.
 */
static DEFINE_SPINLOCK(rt6_exception_lock);
1311
/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct fib6_info *from;
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	net->ipv6.rt6_stats->fib_rt_cache--;

	/* purge completely the exception to allow releasing the held resources:
	 * some [sk] cache may keep the dst around for unlimited time
	 */
	from = xchg((__force struct fib6_info **)&rt6_ex->rt6i->from, NULL);
	fib6_info_release(from);
	dst_dev_put(&rt6_ex->rt6i->dst);

	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	/* entry may still be visible to RCU readers: free after grace period */
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
}
1340
1341 /* Remove oldest rt6_ex in bucket and free the memory
1342  * Caller must hold rt6_exception_lock
1343  */
1344 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1345 {
1346         struct rt6_exception *rt6_ex, *oldest = NULL;
1347
1348         if (!bucket)
1349                 return;
1350
1351         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1352                 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1353                         oldest = rt6_ex;
1354         }
1355         rt6_remove_exception(bucket, oldest);
1356 }
1357
/* Hash a (dst, src) address pair into an exception-table bucket index.
 * @src only contributes when subtrees are configured; the lazily
 * initialized random seed makes bucket placement unpredictable.
 */
static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
1373
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		/* with subtrees both dst and src have to match */
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1406
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		/* with subtrees both dst and src have to match */
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1441
/* Return the effective MTU for the route in @res: the route's explicit
 * PMTU if set, otherwise the nexthop device's IPv6 MTU; capped at
 * IP6_MAX_MTU and reduced by any lightweight-tunnel encap headroom.
 */
static unsigned int fib6_mtu(const struct fib6_result *res)
{
	const struct fib6_nh *nh = res->nh;
	unsigned int mtu;

	if (res->f6i->fib6_pmtu) {
		mtu = res->f6i->fib6_pmtu;
	} else {
		struct net_device *dev = nh->fib_nh_dev;
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
}
1463
/* Insert the cached route @nrt into the exception table of the fib
 * entry in @res.  Replaces an existing entry with the same (dst, src)
 * key, evicts the oldest entry once a bucket exceeds FIB6_MAX_DEPTH,
 * and bumps the table sernum on success so previously handed out
 * cached dsts get revalidated.  Returns 0 or a negative errno.
 */
static int rt6_insert_exception(struct rt6_info *nrt,
				const struct fib6_result *res)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *f6i = res->f6i;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	/* set by rt6_flush_exceptions(): the fib entry is going away */
	if (f6i->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		/* first exception on this fib entry: create the buckets */
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(f6i->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* fib6_src.plen != 0 indicates f6i is in subtree
	 * and exception table is indexed by a hash of
	 * both fib6_dst and fib6_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only fib6_dst.
	 */
	if (f6i->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif
	/* rt6_mtu_change() might lower mtu on f6i.
	 * Only insert this exception route if its mtu
	 * is less than f6i's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(res)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&f6i->fib6_table->tb6_lock);
		fib6_update_sernum(net, f6i);
		spin_unlock_bh(&f6i->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}
1544
/* Tear down @rt's entire exception table and mark the entry so no new
 * exceptions can be added; called when the fib entry is being removed.
 */
void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() to recreate the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}
1571
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	const struct in6_addr *src_key = NULL;
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct rt6_info *ret = NULL;

#ifdef CONFIG_IPV6_SUBTREES
	/* fib6i_src.plen != 0 indicates f6i is in subtree
	 * and exception table is indexed by a hash of
	 * both fib6_dst and fib6_src.
	 * However, the src addr used to create the hash
	 * might not be exactly the passed in saddr which
	 * is a /128 addr from the flow.
	 * So we need to use f6i->fib6_src to redo lookup
	 * if the passed in saddr does not find anything.
	 * (See the logic in ip6_rt_cache_alloc() on how
	 * rt->rt6i_src is updated.)
	 */
	if (res->f6i->fib6_src.plen)
		src_key = saddr;
find_ex:
#endif
	bucket = rcu_dereference(res->f6i->rt6i_exception_bucket);
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	/* expired entries must not be returned */
	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		ret = rt6_ex->rt6i;

#ifdef CONFIG_IPV6_SUBTREES
	/* Use fib6_src as src_key and redo lookup */
	if (!ret && src_key && src_key != &res->f6i->fib6_src.addr) {
		src_key = &res->f6i->fib6_src.addr;
		goto find_ex;
	}
#endif

	return ret;
}
1616
/* Remove the passed in cached rt from the hash table that contains it.
 * Returns 0 on success, -EINVAL when @rt is not a cached route, or
 * -ENOENT when no matching exception entry exists.
 */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	/* NOTE(review): rcu_dereference() here presumes the caller holds
	 * rcu_read_lock() - confirm at call sites
	 */
	from = rcu_dereference(rt->from);
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}
1660
/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	if (!from || !(rt->rt6i_flags & RTF_CACHE))
		goto unlock;

	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	/* stamp is consulted by rt6_exception_remove_oldest() */
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

unlock:
	rcu_read_unlock();
}
1697
1698 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1699                                          struct rt6_info *rt, int mtu)
1700 {
1701         /* If the new MTU is lower than the route PMTU, this new MTU will be the
1702          * lowest MTU in the path: always allow updating the route PMTU to
1703          * reflect PMTU decreases.
1704          *
1705          * If the new MTU is higher, and the route PMTU is equal to the local
1706          * MTU, this means the old MTU is the lowest in the path, so allow
1707          * updating it: if other nodes now have lower MTUs, PMTU discovery will
1708          * handle this.
1709          */
1710
1711         if (dst_mtu(&rt->dst) >= mtu)
1712                 return true;
1713
1714         if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1715                 return true;
1716
1717         return false;
1718 }
1719
/* Apply a device MTU change @mtu to every exception route of @rt where
 * the update is allowed (see rt6_mtu_change_route_allowed()).
 * Caller must hold rt6_exception_lock (rcu_dereference_protected).
 */
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}
1748
/* An exception entry that is both a cache clone and a gateway route. */
#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

/* Drop every cached gateway exception of @rt whose gateway address equals
 * @gateway.  Takes rt6_exception_lock itself, so the caller must NOT
 * already hold it.
 */
static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* Lock-free early exit when there is no exception table at all. */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			/* _safe variant: entries are unlinked while walking */
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}
1785
/* Garbage-collect a single exception entry: remove it if aged out,
 * expired, or if its gateway neighbour is no longer flagged as a router;
 * otherwise count it via gc_args->more so the GC knows work remains.
 *
 * Called from rt6_age_exceptions() with rt6_exception_lock held and
 * under rcu_read_lock_bh() (needed for the _noref neighbour lookup).
 */
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others have still references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		/* Gateway exceptions are only kept while the neighbour is
		 * still advertising itself as a router.
		 */
		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}
1829
/* Run exception-table garbage collection for fib entry @rt: walk every
 * bucket and let rt6_age_examine_exception() decide each entry's fate.
 *
 * rcu_read_lock_bh() is taken (not plain rcu_read_lock) because the
 * examine step performs a _noref neighbour lookup.
 */
void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* Lock-free fast path: nothing cached for this entry. */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}
1860
/* Look up the best fib entry for @fl6 in @table and fill in @res.
 *
 * If nothing usable is found at a node, backtrack up the tree; once
 * backtracking is exhausted and RT6_LOOKUP_F_REACHABLE was requested,
 * retry the whole search from the original node without that flag so an
 * unreachable route can still be returned as a last resort.
 *
 * must be called with rcu lock held
 */
int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif,
		      struct flowi6 *fl6, struct fib6_result *res, int strict)
{
	struct fib6_node *fn, *saved_fn;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	/* Flow was already steered by l3mdev; don't also match on oif. */
	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	rt6_select(net, fn, oif, res, strict);
	if (res->f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, res, table, fl6);

	return 0;
}
1891
/* Core IPv6 policy route lookup shared by the input and output paths.
 *
 * After the fib table lookup and multipath selection, the dst returned
 * is, in order of preference:
 *   1. a matching exception (RTF_CACHE) entry;
 *   2. an uncached clone, for the FLOWI_FLAG_KNOWN_NH corner case;
 *   3. a per-cpu copy of the fib entry.
 * Every path returns with a reference held that the caller must release.
 */
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_result res = {};
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	/* Hosts (non-forwarding) prefer routers known to be reachable. */
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	fib6_table_lookup(net, table, oif, fl6, &res, strict);
	if (res.f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		return rt;
	}

	fib6_select_path(net, &res, fl6, oif, false, skb, strict);

	/* Search through exception table */
	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !res.nh->fib_nh_gw_family)) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		return uncached_rt;
	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(&res);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, &res);

		local_bh_enable();
		rcu_read_unlock();

		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
1968
1969 static struct rt6_info *ip6_pol_route_input(struct net *net,
1970                                             struct fib6_table *table,
1971                                             struct flowi6 *fl6,
1972                                             const struct sk_buff *skb,
1973                                             int flags)
1974 {
1975         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1976 }
1977
1978 struct dst_entry *ip6_route_input_lookup(struct net *net,
1979                                          struct net_device *dev,
1980                                          struct flowi6 *fl6,
1981                                          const struct sk_buff *skb,
1982                                          int flags)
1983 {
1984         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1985                 flags |= RT6_LOOKUP_F_IFACE;
1986
1987         return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1988 }
1989 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1990
/* Fill @keys with the L3 fields (addresses, flow label, next header)
 * used for multipath hashing of @skb.
 *
 * For ICMPv6 error messages the keys are taken from the *embedded*
 * offending packet, so the error follows the same path as the flow it
 * reports on; any pre-dissected @flkeys are ignored in that case since
 * they describe the outer header.
 */
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	/* skb_header_pointer copies into the on-stack buffer if the
	 * header is not linear; NULL means it is not present at all.
	 */
	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	/* Only ICMPv6 error types carry an embedded packet. */
	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowlabel(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}
2038
/* Compute the multipath hash for a flow, honouring the per-netns
 * fib_multipath_hash_policy sysctl:
 *   0 - L3 only (addresses, flow label, protocol)
 *   1 - L4 (addresses + ports + protocol)
 * The top bit of the hash is reserved (shifted out) so callers can use
 * it as a sign/validity marker.
 *
 * if skb is set it will be used and fl6 can be NULL
 */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			/* Dissect on demand if the caller didn't. */
			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}
2095
/* Input path: resolve a route for @skb and attach it as the skb's dst.
 * Builds the flow from the IPv6 header, picks up tunnel metadata and,
 * for ICMPv6, precomputes the multipath hash so errors stick to the
 * path of the flow they report on.
 */
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	/* Received over a collect-metadata tunnel: match on tunnel id. */
	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}
2125
2126 static struct rt6_info *ip6_pol_route_output(struct net *net,
2127                                              struct fib6_table *table,
2128                                              struct flowi6 *fl6,
2129                                              const struct sk_buff *skb,
2130                                              int flags)
2131 {
2132         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2133 }
2134
/* Output route lookup: derive lookup flags from the socket and flow and
 * delegate to the policy-routing machinery.  Multicast and link-local
 * destinations are first offered to an L3 master device (VRF) for a
 * link-scope lookup.
 */
struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	if (ipv6_addr_type(&fl6->daddr) &
	    (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	/* Locally generated traffic: nominal incoming device is loopback. */
	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		/* Source not chosen yet: honour the socket's address
		 * selection preferences (IPV6_ADDR_PREFERENCES).
		 */
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2164
/* Replace @dst_orig with a "blackhole" copy that silently discards all
 * packets (input and output both dst_discard), preserving the original's
 * metrics, gateway, flags and destination keys.  Consumes the caller's
 * reference on @dst_orig; returns the new dst or ERR_PTR(-ENOMEM).
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new = &rt->dst;
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		/* Not a per-cpu copy of anything. */
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}
2197
2198 /*
2199  *      Destination cache support functions
2200  */
2201
2202 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2203 {
2204         u32 rt_cookie = 0;
2205
2206         if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2207                 return false;
2208
2209         if (fib6_check_expired(f6i))
2210                 return false;
2211
2212         return true;
2213 }
2214
2215 static struct dst_entry *rt6_check(struct rt6_info *rt,
2216                                    struct fib6_info *from,
2217                                    u32 cookie)
2218 {
2219         u32 rt_cookie = 0;
2220
2221         if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2222             rt_cookie != cookie)
2223                 return NULL;
2224
2225         if (rt6_check_expired(rt))
2226                 return NULL;
2227
2228         return &rt->dst;
2229 }
2230
2231 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2232                                             struct fib6_info *from,
2233                                             u32 cookie)
2234 {
2235         if (!__rt6_check_expired(rt) &&
2236             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2237             fib6_check(from, cookie))
2238                 return &rt->dst;
2239         else
2240                 return NULL;
2241 }
2242
/* dst_ops->check callback: decide whether a cached dst is still valid.
 * Per-cpu and uncached clones are validated against their fib origin
 * via rt6_dst_from_check(); everything else via rt6_check().
 */
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct dst_entry *dst_ret;
	struct fib6_info *from;
	struct rt6_info *rt;

	rt = container_of(dst, struct rt6_info, dst);

	rcu_read_lock();

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	from = rcu_dereference(rt->from);

	if (from && (rt->rt6i_flags & RTF_PCPU ||
	    unlikely(!list_empty(&rt->rt6i_uncached))))
		dst_ret = rt6_dst_from_check(rt, from, cookie);
	else
		dst_ret = rt6_check(rt, from, cookie);

	rcu_read_unlock();

	return dst_ret;
}
2270
/* dst_ops->negative_advice callback: called when an upper layer signals
 * the dst is misbehaving.  Expired cache entries are unlinked from the
 * exception table; non-cache dsts are simply released.  Returning NULL
 * tells the caller to drop its dst.
 */
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			rcu_read_lock();
			if (rt6_check_expired(rt)) {
				rt6_remove_exception_rt(rt);
				dst = NULL;
			}
			rcu_read_unlock();
		} else {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}
2290
/* dst_ops->link_failure callback: report unreachability to the sender
 * and invalidate the route that failed.  Cache entries are removed from
 * the exception table; for default routes the fib node's sernum is
 * poisoned so cached dsts fail their next ip6_dst_check().
 */
static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		rcu_read_lock();
		if (rt->rt6i_flags & RTF_CACHE) {
			rt6_remove_exception_rt(rt);
		} else {
			struct fib6_info *from;
			struct fib6_node *fn;

			from = rcu_dereference(rt->from);
			if (from) {
				fn = rcu_dereference(from->fib6_node);
				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
					fn->fn_sernum = -1;
			}
		}
		rcu_read_unlock();
	}
}
2316
/* Arm (or re-arm) the expiry timer on @rt0 for @timeout jiffies and mark
 * it RTF_EXPIRES.  If the dst did not already expire, its expiry is first
 * seeded from the originating fib entry so dst_set_expires() below can
 * only shorten, never extend, an inherited deadline.
 */
static void rt6_update_expires(struct rt6_info *rt0, int timeout)
{
	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
		struct fib6_info *from;

		rcu_read_lock();
		from = rcu_dereference(rt0->from);
		if (from)
			rt0->dst.expires = from->expires;
		rcu_read_unlock();
	}

	dst_set_expires(&rt0->dst, timeout);
	rt0->rt6i_flags |= RTF_EXPIRES;
}
2332
/* Record a learned path MTU on @rt and make the entry temporary: it is
 * marked modified and expires after the ip6_rt_mtu_expires sysctl
 * interval, per RFC 8201 PMTU aging.
 */
static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
	struct net *net = dev_net(rt->dst.dev);

	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
	rt->rt6i_flags |= RTF_MODIFIED;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}
2341
2342 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2343 {
2344         return !(rt->rt6i_flags & RTF_CACHE) &&
2345                 (rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from));
2346 }
2347
/* Apply a learned path MTU to @dst.  The addresses come from @iph if
 * given, else from the socket, else the update is address-less (in
 * which case no exception entry can be created).  Decreases only: an
 * MTU >= the current one is ignored, and the value is clamped up to
 * IPV6_MIN_MTU.
 */
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	/* Administratively locked MTU never changes. */
	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}
	/* A PMTU message proves two-way reachability of the nexthop. */
	dst_confirm_neigh(dst, daddr);
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
	} else if (daddr) {
		/* Store the new MTU in a fresh exception entry cloned
		 * from the originating fib entry.
		 */
		struct fib6_result res = {};
		struct rt6_info *nrt6;

		rcu_read_lock();
		res.f6i = rcu_dereference(rt6->from);
		if (!res.f6i) {
			rcu_read_unlock();
			return;
		}
		res.nh = &res.f6i->fib6_nh;
		res.fib6_flags = res.f6i->fib6_flags;
		res.fib6_type = res.f6i->fib6_type;

		nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);
			/* Non-zero return means insertion failed; drop
			 * the clone's reference immediately.
			 */
			if (rt6_insert_exception(nrt6, &res))
				dst_release_immediate(&nrt6->dst);
		}
		rcu_read_unlock();
	}
}
2400
2401 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2402                                struct sk_buff *skb, u32 mtu)
2403 {
2404         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2405 }
2406
/* Update the path MTU for the flow described by the (offending) packet
 * in @skb.  @mtu arrives in network byte order (__be32) and is converted
 * before use.  A zero @mark falls back to the netns reply-mark policy.
 */
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
		     int oif, u32 mark, kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_uid = uid,
	};

	dst = ip6_route_output(net, NULL, &fl6);
	if (!dst->error)
		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2427
/* Socket-aware PMTU update: apply the new MTU for the socket's flow and,
 * if that invalidated the socket's cached dst, refresh it (datagram
 * sockets only, and only when the socket is not owned by user context).
 */
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	int oif = sk->sk_bound_dev_if;
	struct dst_entry *dst;

	/* Unbound socket on an enslaved device: scope to the VRF master. */
	if (!oif && skb->dev)
		oif = l3mdev_master_ifindex(skb->dev);

	ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);

	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2449
/* Cache @dst on @sk, recording which addresses the route was keyed on:
 * the destination (and, with subtrees, the source) is passed to
 * ip6_dst_store() only when it matches the socket's own address, so the
 * cached route is not misattributed to a different flow.
 */
void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
			   const struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_SUBTREES
	struct ipv6_pinfo *np = inet6_sk(sk);
#endif

	ip6_dst_store(sk, dst,
		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
		      &sk->sk_v6_daddr : NULL,
#ifdef CONFIG_IPV6_SUBTREES
		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
		      &np->saddr :
#endif
		      NULL);
}
2466
/* Check whether nexthop @res->nh could have originated a redirect from
 * gateway @gw for flow @fl6.  On a match via the exception table, the
 * cached route is returned through @ret (no reference taken here; the
 * caller holds rcu and takes its own).
 */
static bool ip6_redirect_nh_match(const struct fib6_result *res,
				  struct flowi6 *fl6,
				  const struct in6_addr *gw,
				  struct rt6_info **ret)
{
	const struct fib6_nh *nh = res->nh;

	/* Must be a live gateway nexthop on the interface the redirect
	 * arrived on.
	 */
	if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family ||
	    fl6->flowi6_oif != nh->fib_nh_dev->ifindex)
		return false;

	/* rt_cache's gateway might be different from its 'parent'
	 * in the case of an ip redirect.
	 * So we keep searching in the exception table if the gateway
	 * is different.
	 */
	if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) {
		struct rt6_info *rt_cache;

		rt_cache = rt6_find_cached_rt(res, &fl6->daddr, &fl6->saddr);
		if (rt_cache &&
		    ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) {
			*ret = rt_cache;
			return true;
		}
		return false;
	}
	return true;
}
2496
/* Handle redirects.
 * The flowi6 must be the first member so a struct flowi6 * passed down
 * the lookup path can be cast back to ip6rd_flowi to recover the
 * redirecting gateway (see __ip6_route_redirect()).
 */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;
};
2502
/* Policy-lookup callback for redirect handling: find the route whose
 * nexthop matches the redirecting gateway (recovered by casting @fl6
 * back to struct ip6rd_flowi) so the redirect can be validated and a
 * cache entry updated.  Returns a referenced rt6_info.
 */
static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *ret = NULL;
	struct fib6_result res = {};
	struct fib6_info *rt;
	struct fib6_node *fn;

	/* l3mdev_update_flow overrides oif if the device is enslaved; in
	 * this case we must match on the real ingress device, so reset it
	 */
	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		fl6->flowi6_oif = skb->dev->ifindex;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for_each_fib6_node_rt_rcu(fn) {
		res.f6i = rt;
		res.nh = &rt->fib6_nh;

		if (fib6_check_expired(rt))
			continue;
		if (rt->fib6_flags & RTF_REJECT)
			break;
		if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway, &ret))
			goto out;
	}

	if (!rt)
		rt = net->ipv6.fib6_null_entry;
	else if (rt->fib6_flags & RTF_REJECT) {
		ret = net->ipv6.ip6_null_entry;
		goto out;
	}

	/* Nothing matched at this node: walk back up the tree. */
	if (rt == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	res.f6i = rt;
	res.nh = &rt->fib6_nh;
out:
	if (ret) {
		ip6_hold_safe(net, &ret);
	} else {
		/* No cached entry: build a fresh rt6_info under rcu. */
		res.fib6_flags = res.f6i->fib6_flags;
		res.fib6_type = res.f6i->fib6_type;
		ret = ip6_create_rt_rcu(&res);
	}

	rcu_read_unlock();

	trace_fib6_table_lookup(net, &res, table, fl6);
	return ret;
};
2575
2576 static struct dst_entry *ip6_route_redirect(struct net *net,
2577                                             const struct flowi6 *fl6,
2578                                             const struct sk_buff *skb,
2579                                             const struct in6_addr *gateway)
2580 {
2581         int flags = RT6_LOOKUP_F_HAS_SADDR;
2582         struct ip6rd_flowi rdfl;
2583
2584         rdfl.fl6 = *fl6;
2585         rdfl.gateway = *gateway;
2586
2587         return fib6_rule_lookup(net, &rdfl.fl6, skb,
2588                                 flags, __ip6_route_redirect);
2589 }
2590
2591 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2592                   kuid_t uid)
2593 {
2594         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2595         struct dst_entry *dst;
2596         struct flowi6 fl6 = {
2597                 .flowi6_iif = LOOPBACK_IFINDEX,
2598                 .flowi6_oif = oif,
2599                 .flowi6_mark = mark,
2600                 .daddr = iph->daddr,
2601                 .saddr = iph->saddr,
2602                 .flowlabel = ip6_flowinfo(iph),
2603                 .flowi6_uid = uid,
2604         };
2605
2606         dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2607         rt6_do_redirect(dst, NULL, skb);
2608         dst_release(dst);
2609 }
2610 EXPORT_SYMBOL_GPL(ip6_redirect);
2611
2612 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
2613 {
2614         const struct ipv6hdr *iph = ipv6_hdr(skb);
2615         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2616         struct dst_entry *dst;
2617         struct flowi6 fl6 = {
2618                 .flowi6_iif = LOOPBACK_IFINDEX,
2619                 .flowi6_oif = oif,
2620                 .daddr = msg->dest,
2621                 .saddr = iph->daddr,
2622                 .flowi6_uid = sock_net_uid(net, NULL),
2623         };
2624
2625         dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2626         rt6_do_redirect(dst, NULL, skb);
2627         dst_release(dst);
2628 }
2629
/* Socket variant of ip6_redirect(): scope the lookup to the socket's
 * bound device, fwmark and uid.
 */
void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
		     sk->sk_uid);
}
EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2636
2637 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2638 {
2639         struct net_device *dev = dst->dev;
2640         unsigned int mtu = dst_mtu(dst);
2641         struct net *net = dev_net(dev);
2642
2643         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2644
2645         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2646                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2647
2648         /*
2649          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2650          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2651          * IPV6_MAXPLEN is also valid and means: "any MSS,
2652          * rely only on pmtu discovery"
2653          */
2654         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2655                 mtu = IPV6_MAXPLEN;
2656         return mtu;
2657 }
2658
2659 static unsigned int ip6_mtu(const struct dst_entry *dst)
2660 {
2661         struct inet6_dev *idev;
2662         unsigned int mtu;
2663
2664         mtu = dst_metric_raw(dst, RTAX_MTU);
2665         if (mtu)
2666                 goto out;
2667
2668         mtu = IPV6_MIN_MTU;
2669
2670         rcu_read_lock();
2671         idev = __in6_dev_get(dst->dev);
2672         if (idev)
2673                 mtu = idev->cnf.mtu6;
2674         rcu_read_unlock();
2675
2676 out:
2677         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2678
2679         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2680 }
2681
/* MTU selection:
 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 *
 * based on ip6_dst_mtu_forward and exception logic of
 * rt6_find_cached_rt; called with rcu_read_lock
 */
u32 ip6_mtu_from_fib6(const struct fib6_result *res,
		      const struct in6_addr *daddr,
		      const struct in6_addr *saddr)
{
	const struct fib6_nh *nh = res->nh;
	struct fib6_info *f6i = res->f6i;
	struct inet6_dev *idev;
	struct rt6_info *rt;
	u32 mtu = 0;

	/* 1. locked MTU on the route itself */
	if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
		mtu = f6i->fib6_pmtu;
		if (mtu)
			goto out;
	}

	/* 2. MTU recorded in a cached exception route for this flow */
	rt = rt6_find_cached_rt(res, daddr, saddr);
	if (unlikely(rt)) {
		mtu = dst_metric_raw(&rt->dst, RTAX_MTU);
	} else {
		/* 3. fall back to the egress device's mtu6 */
		struct net_device *dev = nh->fib_nh_dev;

		mtu = IPV6_MIN_MTU;
		idev = __in6_dev_get(dev);
		if (idev && idev->cnf.mtu6 > mtu)
			mtu = idev->cnf.mtu6;
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
out:
	/* reserve headroom for any lightweight tunnel encap on the nexthop */
	return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
}
2722
/* Allocate a host route (dst) for sending an ICMPv6/NDISC packet
 * described by @fl6 out of @dev, without a FIB lookup or insertion.
 * Returns an ERR_PTR if @dev has no inet6_dev or the allocation fails,
 * otherwise the result of running the new dst through xfrm_lookup().
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		in6_dev_put(idev);	/* drop the ref taken above */
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output  = ip6_output;
	rt->rt6i_gateway  = fl6->daddr;	/* /128 host route: gateway == destination */
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev     = idev;	/* rt now owns the inet6_dev reference */
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Add this dst into uncached_list so that rt6_disable_ip() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);
	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}
2761
/* dst_ops->gc callback.  Runs fib6 garbage collection when the minimum
 * gc interval has elapsed or the dst entry count exceeds
 * ip6_rt_max_size.  Returns nonzero (making dst_alloc() fail) while
 * the entry count is still above the limit.
 */
static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	/* skip gc: last run was recent and we are under the size cap */
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	/* increase expiry pressure each time we are forced to collect */
	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
	/* decay the expiry pressure exponentially (elasticity shift) */
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
	return entries > rt_max_size;
}
2786
2787 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2788                                             struct fib6_config *cfg,
2789                                             const struct in6_addr *gw_addr,
2790                                             u32 tbid, int flags)
2791 {
2792         struct flowi6 fl6 = {
2793                 .flowi6_oif = cfg->fc_ifindex,
2794                 .daddr = *gw_addr,
2795                 .saddr = cfg->fc_prefsrc,
2796         };
2797         struct fib6_table *table;
2798         struct rt6_info *rt;
2799
2800         table = fib6_get_table(net, tbid);
2801         if (!table)
2802                 return NULL;
2803
2804         if (!ipv6_addr_any(&cfg->fc_prefsrc))
2805                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2806
2807         flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2808         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2809
2810         /* if table lookup failed, fall back to full lookup */
2811         if (rt == net->ipv6.ip6_null_entry) {
2812                 ip6_rt_put(rt);
2813                 rt = NULL;
2814         }
2815
2816         return rt;
2817 }
2818
/* Validate an onlink nexthop gateway against the table associated with
 * @dev (the l3mdev table if enslaved, else the main table).  The
 * gateway is rejected if an existing non-default route resolves it to
 * a local/anycast/reject route or to a different device.
 * Returns 0 when acceptable, -EINVAL with extack set otherwise.
 */
static int ip6_route_check_nh_onlink(struct net *net,
				     struct fib6_config *cfg,
				     const struct net_device *dev,
				     struct netlink_ext_ack *extack)
{
	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
	struct fib6_info *from;
	struct rt6_info *grt;
	int err;

	err = 0;
	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
	if (grt) {
		rcu_read_lock();	/* protects grt->from */
		from = rcu_dereference(grt->from);
		if (!grt->dst.error &&
		    /* ignore match if it is the default route */
		    from && !ipv6_addr_any(&from->fib6_dst.addr) &&
		    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop has invalid gateway or device mismatch");
			err = -EINVAL;
		}
		rcu_read_unlock();

		ip6_rt_put(grt);	/* drop ref from lookup */
	}

	return err;
}
2851
/* Resolve a route's gateway to an egress device.  Tries cfg->fc_table
 * first (if set), then a plain rt6_lookup().  When *_dev is not yet
 * known, it and *idev are filled from the matched route with
 * references taken.  Returns 0 when the gateway resolves to a
 * directly connected (non-gateway) route on the right device,
 * -EHOSTUNREACH otherwise.
 */
static int ip6_route_check_nh(struct net *net,
			      struct fib6_config *cfg,
			      struct net_device **_dev,
			      struct inet6_dev **idev)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	struct net_device *dev = _dev ? *_dev : NULL;
	struct rt6_info *grt = NULL;
	int err = -EHOSTUNREACH;

	if (cfg->fc_table) {
		int flags = RT6_LOOKUP_F_IFACE;

		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
					  cfg->fc_table, flags);
		if (grt) {
			/* discard matches via another gateway or device */
			if (grt->rt6i_flags & RTF_GATEWAY ||
			    (dev && dev != grt->dst.dev)) {
				ip6_rt_put(grt);
				grt = NULL;
			}
		}
	}

	if (!grt)
		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);

	if (!grt)
		goto out;

	if (dev) {
		if (dev != grt->dst.dev) {
			ip6_rt_put(grt);
			goto out;
		}
	} else {
		/* adopt the matched route's device and idev, with refs */
		*_dev = dev = grt->dst.dev;
		*idev = grt->rt6i_idev;
		dev_hold(dev);
		in6_dev_hold(grt->rt6i_idev);
	}

	/* directly connected nexthop: gateway is valid */
	if (!(grt->rt6i_flags & RTF_GATEWAY))
		err = 0;

	ip6_rt_put(grt);

out:
	return err;
}
2902
/* Validate the gateway of a route being added and resolve the egress
 * device when it is not known yet.  On success, *_dev / *idev identify
 * the egress device (references taken by ip6_route_check_nh() if it
 * filled them in).  Returns 0 on success or a negative errno with
 * extack set.
 */
static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
			   struct net_device **_dev, struct inet6_dev **idev,
			   struct netlink_ext_ack *extack)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	int gwa_type = ipv6_addr_type(gw_addr);
	/* for link-local gateways the address may live on the device itself */
	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
	const struct net_device *dev = *_dev;
	bool need_addr_check = !dev;
	int err = -EINVAL;

	/* if gw_addr is local we will fail to detect this in case
	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
	 * will return already-added prefix route via interface that
	 * prefix route was assigned to, which might be non-loopback.
	 */
	if (dev &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
		/* IPv6 strictly inhibits using not link-local
		 * addresses as nexthop address.
		 * Otherwise, router will not able to send redirects.
		 * It is very good, but in some (rare!) circumstances
		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
		 * some exceptions. --ANK
		 * We allow IPv4-mapped nexthops to support RFC4798-type
		 * addressing
		 */
		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
			NL_SET_ERR_MSG(extack, "Invalid gateway address");
			goto out;
		}

		if (cfg->fc_flags & RTNH_F_ONLINK)
			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
		else
			err = ip6_route_check_nh(net, cfg, _dev, idev);

		if (err)
			goto out;
	}

	/* reload in case device was changed */
	dev = *_dev;

	err = -EINVAL;
	if (!dev) {
		NL_SET_ERR_MSG(extack, "Egress device not specified");
		goto out;
	} else if (dev->flags & IFF_LOOPBACK) {
		NL_SET_ERR_MSG(extack,
			       "Egress device can not be loopback device for this route");
		goto out;
	}

	/* if we did not check gw_addr above, do so now that the
	 * egress device has been resolved.
	 */
	if (need_addr_check &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	err = 0;
out:
	return err;
}
2975
2976 static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type)
2977 {
2978         if ((flags & RTF_REJECT) ||
2979             (dev && (dev->flags & IFF_LOOPBACK) &&
2980              !(addr_type & IPV6_ADDR_LOOPBACK) &&
2981              !(flags & RTF_LOCAL)))
2982                 return true;
2983
2984         return false;
2985 }
2986
/* Initialize @fib6_nh from the route configuration @cfg: resolve and
 * hold the nexthop device, validate the gateway when RTF_GATEWAY is
 * set, redirect impossible loopback routes to the loopback device, and
 * set up the common nexthop state (incl. lwtunnel encap) via
 * fib_nh_common_init().  On success fib6_nh owns the device reference;
 * on failure every reference and lwtstate acquired here is released.
 */
int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
		 struct fib6_config *cfg, gfp_t gfp_flags,
		 struct netlink_ext_ack *extack)
{
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	int addr_type;
	int err;

	fib6_nh->fib_nh_family = AF_INET6;

	/* resolve the user-specified device, taking dev and idev refs */
	err = -ENODEV;
	if (cfg->fc_ifindex) {
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_flags & RTNH_F_ONLINK) {
		if (!dev) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop device required for onlink");
			goto out;
		}

		if (!(dev->flags & IFF_UP)) {
			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
			err = -ENETDOWN;
			goto out;
		}

		fib6_nh->fib_nh_flags |= RTNH_F_ONLINK;
	}

	fib6_nh->fib_nh_weight = 1;

	/* We cannot add true routes via loopback here,
	 * they would result in kernel looping; promote them to reject routes
	 */
	addr_type = ipv6_addr_type(&cfg->fc_dst);
	if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		/* reject routes skip gateway and device-state checks */
		goto set_dev;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		/* may replace dev/idev with the resolved egress device */
		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
		if (err)
			goto out;

		fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
		fib6_nh->fib_nh_gw_family = AF_INET6;
	}

	err = -ENODEV;
	if (!dev)
		goto out;

	if (idev->cnf.disable_ipv6) {
		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
		err = -EACCES;
		goto out;
	}

	if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) {
		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
		err = -ENETDOWN;
		goto out;
	}

	if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
	    !netif_carrier_ok(dev))
		fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;

	err = fib_nh_common_init(&fib6_nh->nh_common, cfg->fc_encap,
				 cfg->fc_encap_type, cfg, gfp_flags, extack);
	if (err)
		goto out;
set_dev:
	/* transfer the dev reference to the nexthop */
	fib6_nh->fib_nh_dev = dev;
	fib6_nh->fib_nh_oif = dev->ifindex;
	err = 0;
out:
	/* idev was only needed during validation; always drop it here */
	if (idev)
		in6_dev_put(idev);

	if (err) {
		lwtstate_put(fib6_nh->fib_nh_lws);
		fib6_nh->fib_nh_lws = NULL;
		if (dev)
			dev_put(dev);
	}

	return err;
}
3098
/* Release the resources held by a fib6_nh via the common nexthop code */
void fib6_nh_release(struct fib6_nh *fib6_nh)
{
	fib_nh_common_release(&fib6_nh->nh_common);
}
3103
/* Allocate and initialize a fib6_info from a route configuration:
 * validate @cfg, select (or create) the FIB table, and set up metrics,
 * expiry, addresses and the nexthop.  Returns the new fib6_info (not
 * yet inserted into the table) or an ERR_PTR with extack set.
 */
static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
					      gfp_t gfp_flags,
					      struct netlink_ext_ack *extack)
{
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct fib6_info *rt = NULL;
	struct fib6_table *table;
	int err = -EINVAL;
	int addr_type;

	/* RTF_PCPU is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_PCPU) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
		goto out;
	}

	/* RTF_CACHE is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_CACHE) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
		goto out;
	}

	if (cfg->fc_type > RTN_MAX) {
		NL_SET_ERR_MSG(extack, "Invalid route type");
		goto out;
	}

	if (cfg->fc_dst_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		goto out;
	}
	if (cfg->fc_src_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid source address length");
		goto out;
	}
#ifndef CONFIG_IPV6_SUBTREES
	if (cfg->fc_src_len) {
		NL_SET_ERR_MSG(extack,
			       "Specifying source address requires IPV6_SUBTREES to be enabled");
		goto out;
	}
#endif

	/* pick the table: an existing one unless NLM_F_CREATE allows
	 * (or the warning path tolerates) creating it
	 */
	err = -ENOBUFS;
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
		if (!table) {
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}

	if (!table)
		goto out;

	err = -ENOMEM;
	rt = fib6_info_alloc(gfp_flags);
	if (!rt)
		goto out;

	rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
					       extack);
	if (IS_ERR(rt->fib6_metrics)) {
		err = PTR_ERR(rt->fib6_metrics);
		/* Do not leave garbage there. */
		rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
		goto out;
	}

	if (cfg->fc_flags & RTF_ADDRCONF)
		rt->dst_nocount = true;

	if (cfg->fc_flags & RTF_EXPIRES)
		fib6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
	else
		fib6_clean_expires(rt);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->fib6_protocol = cfg->fc_protocol;

	rt->fib6_table = table;
	rt->fib6_metric = cfg->fc_metric;
	rt->fib6_type = cfg->fc_type;
	/* RTF_GATEWAY is tracked in the nexthop, not on the route */
	rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY;

	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->fib6_dst.plen = cfg->fc_dst_len;
	if (rt->fib6_dst.plen == 128)
		rt->dst_host = true;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->fib6_src.plen = cfg->fc_src_len;
#endif
	err = fib6_nh_init(net, &rt->fib6_nh, cfg, gfp_flags, extack);
	if (err)
		goto out;

	/* We cannot add true routes via loopback here,
	 * they would result in kernel looping; promote them to reject routes
	 */
	addr_type = ipv6_addr_type(&cfg->fc_dst);
	if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh.fib_nh_dev, addr_type))
		rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;

	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		struct net_device *dev = fib6_info_nh_dev(rt);

		/* the preferred source must be an address on the nexthop dev */
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid source address");
			err = -EINVAL;
			goto out;
		}
		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
		rt->fib6_prefsrc.plen = 128;
	} else
		rt->fib6_prefsrc.plen = 0;

	return rt;
out:
	fib6_info_release(rt);
	return ERR_PTR(err);
}
3232
3233 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3234                   struct netlink_ext_ack *extack)
3235 {
3236         struct fib6_info *rt;
3237         int err;
3238
3239         rt = ip6_route_info_create(cfg, gfp_flags, extack);
3240         if (IS_ERR(rt))
3241                 return PTR_ERR(rt);
3242
3243         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3244         fib6_info_release(rt);
3245
3246         return err;
3247 }
3248
3249 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3250 {
3251         struct net *net = info->nl_net;
3252         struct fib6_table *table;
3253         int err;
3254
3255         if (rt == net->ipv6.fib6_null_entry) {
3256                 err = -ENOENT;
3257                 goto out;
3258         }
3259
3260         table = rt->fib6_table;
3261         spin_lock_bh(&table->tb6_lock);
3262         err = fib6_del(rt, info);
3263         spin_unlock_bh(&table->tb6_lock);
3264
3265 out:
3266         fib6_info_release(rt);
3267         return err;
3268 }
3269
3270 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3271 {
3272         struct nl_info info = { .nl_net = net };
3273
3274         return __ip6_del_rt(rt, &info);
3275 }
3276
/* Delete @rt and, when fc_delete_all_nh is set, all of its multipath
 * siblings.  When possible a single RTM_DELROUTE notification covering
 * every hop is sent instead of per-route notifications.  Consumes the
 * caller's reference on @rt.
 */
static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	int err = -ENOENT;

	if (rt == net->ipv6.fib6_null_entry)
		goto out_put;
	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);

	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
		struct fib6_info *sibling, *next_sibling;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			if (rt6_fill_node(net, skb, rt, NULL,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				kfree_skb(skb);
				skb = NULL;
			} else
				/* combined message built: suppress per-route
				 * notifications from fib6_del()
				 */
				info->skip_notify = 1;
		}

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings,
					 fib6_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
		}
	}

	err = fib6_del(rt, info);
out_unlock:
	spin_unlock_bh(&table->tb6_lock);
out_put:
	fib6_info_release(rt);

	/* send the combined notification outside the table lock */
	if (skb) {
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
	return err;
}
3328
3329 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3330 {
3331         int rc = -ESRCH;
3332
3333         if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3334                 goto out;
3335
3336         if (cfg->fc_flags & RTF_GATEWAY &&
3337             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3338                 goto out;
3339
3340         rc = rt6_remove_exception_rt(rt);
3341 out:
3342         return rc;
3343 }
3344
/* Delete a route matching @cfg from its table.  With RTF_CACHE, only a
 * matching cached (exception) route is removed.  Otherwise the first
 * fib entry matching device, gateway, metric and protocol is deleted:
 * just that one hop when a gateway was given, or the whole sibling
 * group otherwise.  Returns -ESRCH if nothing matched.
 */
static int ip6_route_del(struct fib6_config *cfg,
			 struct netlink_ext_ack *extack)
{
	struct rt6_info *rt_cache;
	struct fib6_table *table;
	struct fib6_info *rt;
	struct fib6_node *fn;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table) {
		NL_SET_ERR_MSG(extack, "FIB table does not exist");
		return err;
	}

	rcu_read_lock();

	/* for RTF_CACHE, locate the exact node rather than a covering one */
	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len,
			 !(cfg->fc_flags & RTF_CACHE));

	if (fn) {
		for_each_fib6_node_rt_rcu(fn) {
			struct fib6_nh *nh;

			if (cfg->fc_flags & RTF_CACHE) {
				struct fib6_result res = {
					.f6i = rt,
				};
				int rc;

				rt_cache = rt6_find_cached_rt(&res,
							      &cfg->fc_dst,
							      &cfg->fc_src);
				if (rt_cache) {
					rc = ip6_del_cached_rt(rt_cache, cfg);
					if (rc != -ESRCH) {
						rcu_read_unlock();
						return rc;
					}
				}
				continue;
			}

			/* match the fib entry against the config filters */
			nh = &rt->fib6_nh;
			if (cfg->fc_ifindex &&
			    (!nh->fib_nh_dev ||
			     nh->fib_nh_dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6))
				continue;
			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
				continue;
			if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
				continue;
			/* take a reference before dropping RCU; skip entries
			 * that are already going away
			 */
			if (!fib6_info_hold_safe(rt))
				continue;
			rcu_read_unlock();

			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				return __ip6_del_rt(rt, &cfg->fc_nlinfo);

			return __ip6_del_rt_siblings(rt, cfg);
		}
	}
	rcu_read_unlock();

	return err;
}
3417
/* Process an ICMPv6 Redirect (RFC 4861, section 8) received for @dst.
 *
 * Performs the validity checks of section 8.1, updates the neighbour
 * cache with the redirect target's link-layer address, and installs a
 * cached (exception) route steering msg->dest to the new first hop.
 */
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct netevent_redirect netevent;
	struct rt6_info *rt, *nrt = NULL;
	struct fib6_result res = {};
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	struct rd_msg *msg;
	int optlen, on_link;
	u8 *lladdr;

	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
	optlen -= sizeof(*msg);

	if (optlen < 0) {
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
		return;
	}

	msg = (struct rd_msg *)icmp6_hdr(skb);

	if (ipv6_addr_is_multicast(&msg->dest)) {
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
		return;
	}

	/* dest == target means the destination itself is on-link; otherwise
	 * the target must be a link-local unicast address of a router.
	 */
	on_link = 0;
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
		on_link = 1;
	} else if (ipv6_addr_type(&msg->target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
		return;
	}

	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	/* Hosts only: routers and interfaces that refuse redirects bail out. */
	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}

	lladdr = NULL;
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

	rt = (struct rt6_info *) dst;
	if (rt->rt6i_flags & RTF_REJECT) {
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
		return;
	}

	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);

	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
	if (!neigh)
		return;

	/*
	 *	We have finally decided to accept it.
	 */

	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER)),
		     NDISC_REDIRECT, &ndopts);

	rcu_read_lock();
	res.f6i = rcu_dereference(rt->from);
	if (!res.f6i)
		goto out;

	res.nh = &res.f6i->fib6_nh;
	res.fib6_flags = res.f6i->fib6_flags;
	res.fib6_type = res.f6i->fib6_type;
	nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL);
	if (!nrt)
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

	/* rt6_insert_exception() will take care of duplicated exceptions */
	if (rt6_insert_exception(nrt, &res)) {
		dst_release_immediate(&nrt->dst);
		goto out;
	}

	/* Let interested parties (e.g. hardware offload) know the path moved. */
	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
	netevent.daddr = &msg->dest;
	netevent.neigh = neigh;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

out:
	rcu_read_unlock();
	neigh_release(neigh);
}
3541
3542 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Find an RA route-information (RFC 4191) route for @prefix/@prefixlen
 * via @gwaddr on @dev; returns it with a reference held, or NULL.
 */
static struct fib6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev)
{
	/* Devices enslaved to an l3mdev (VRF) use the VRF's table. */
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	int ifindex = dev->ifindex;
	struct fib6_node *fn;
	struct fib6_info *rt = NULL;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
	if (!fn)
		goto out;

	/* for_each_fib6_node_rt_rcu() iterates with 'rt' as cursor. */
	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.fib_nh_dev->ifindex != ifindex)
			continue;
		if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
		    !rt->fib6_nh.fib_nh_gw_family)
			continue;
		if (!ipv6_addr_equal(&rt->fib6_nh.fib_nh_gw6, gwaddr))
			continue;
		/* Skip entries already being released. */
		if (!fib6_info_hold_safe(rt))
			continue;
		break;
	}
out:
	rcu_read_unlock();
	return rt;
}
3579
3580 static struct fib6_info *rt6_add_route_info(struct net *net,
3581                                            const struct in6_addr *prefix, int prefixlen,
3582                                            const struct in6_addr *gwaddr,
3583                                            struct net_device *dev,
3584                                            unsigned int pref)
3585 {
3586         struct fib6_config cfg = {
3587                 .fc_metric      = IP6_RT_PRIO_USER,
3588                 .fc_ifindex     = dev->ifindex,
3589                 .fc_dst_len     = prefixlen,
3590                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3591                                   RTF_UP | RTF_PREF(pref),
3592                 .fc_protocol = RTPROT_RA,
3593                 .fc_type = RTN_UNICAST,
3594                 .fc_nlinfo.portid = 0,
3595                 .fc_nlinfo.nlh = NULL,
3596                 .fc_nlinfo.nl_net = net,
3597         };
3598
3599         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3600         cfg.fc_dst = *prefix;
3601         cfg.fc_gateway = *gwaddr;
3602
3603         /* We should treat it as a default route if prefix length is 0. */
3604         if (!prefixlen)
3605                 cfg.fc_flags |= RTF_DEFAULT;
3606
3607         ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3608
3609         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3610 }
3611 #endif
3612
/* Find the RA-learned default route via @addr on @dev; returns it with
 * a reference held, or NULL when absent or already being released.
 */
struct fib6_info *rt6_get_dflt_router(struct net *net,
				     const struct in6_addr *addr,
				     struct net_device *dev)
{
	/* VRF-enslaved devices keep default routers in the l3mdev table. */
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
	struct fib6_info *rt;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	/* for_each_fib6_node_rt_rcu() iterates with 'rt' as cursor;
	 * rt stays NULL-terminated when the list is exhausted.
	 */
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		struct fib6_nh *nh = &rt->fib6_nh;

		/* RA defaults carry both RTF_ADDRCONF and RTF_DEFAULT. */
		if (dev == nh->fib_nh_dev &&
		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&nh->fib_nh_gw6, addr))
			break;
	}
	if (rt && !fib6_info_hold_safe(rt))
		rt = NULL;
	rcu_read_unlock();
	return rt;
}
3639
/* Install an RA-learned default route via @gwaddr on @dev and return a
 * referenced pointer to the (re-)looked-up entry, or NULL on failure.
 */
struct fib6_info *rt6_add_dflt_router(struct net *net,
				     const struct in6_addr *gwaddr,
				     struct net_device *dev,
				     unsigned int pref)
{
	struct fib6_config cfg = {
		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
		.fc_metric	= IP6_RT_PRIO_USER,
		.fc_ifindex	= dev->ifindex,
		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
		.fc_protocol = RTPROT_RA,
		.fc_type = RTN_UNICAST,
		.fc_nlinfo.portid = 0,
		.fc_nlinfo.nlh = NULL,
		.fc_nlinfo.nl_net = net,
	};

	cfg.fc_gateway = *gwaddr;

	if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
		struct fib6_table *table;

		/* Flag the table so rt6_purge_dflt_routers() can skip
		 * tables that never held a default router.
		 */
		table = fib6_get_table(dev_net(dev), cfg.fc_table);
		if (table)
			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
	}

	return rt6_get_dflt_router(net, gwaddr, dev);
}
3670
/* Delete every RA-learned default route from @table.  The walk restarts
 * from scratch after each deletion: ip6_del_rt() must run outside the
 * RCU read section, which invalidates the iterator.
 */
static void __rt6_purge_dflt_routers(struct net *net,
				     struct fib6_table *table)
{
	struct fib6_info *rt;

restart:
	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;

		/* accept_ra == 2 keeps RA routes even on forwarding
		 * interfaces, so such routes are spared here.
		 */
		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!idev || idev->cnf.accept_ra != 2) &&
		    fib6_info_hold_safe(rt)) {
			rcu_read_unlock();
			ip6_del_rt(net, rt);
			goto restart;
		}
	}
	rcu_read_unlock();

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}
3694
3695 void rt6_purge_dflt_routers(struct net *net)
3696 {
3697         struct fib6_table *table;
3698         struct hlist_head *head;
3699         unsigned int h;
3700
3701         rcu_read_lock();
3702
3703         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3704                 head = &net->ipv6.fib_table_hash[h];
3705                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3706                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3707                                 __rt6_purge_dflt_routers(net, table);
3708                 }
3709         }
3710
3711         rcu_read_unlock();
3712 }
3713
/* Translate a legacy ioctl in6_rtmsg into a fib6_config.  Fields not
 * listed are zero-initialized by the compound literal.
 */
static void rtmsg_to_fib6_config(struct net *net,
				 struct in6_rtmsg *rtmsg,
				 struct fib6_config *cfg)
{
	*cfg = (struct fib6_config){
		/* Routes via a VRF slave device go into the l3mdev table. */
		.fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
			 : RT6_TABLE_MAIN,
		.fc_ifindex = rtmsg->rtmsg_ifindex,
		.fc_metric = rtmsg->rtmsg_metric ? : IP6_RT_PRIO_USER,
		.fc_expires = rtmsg->rtmsg_info,
		.fc_dst_len = rtmsg->rtmsg_dst_len,
		.fc_src_len = rtmsg->rtmsg_src_len,
		.fc_flags = rtmsg->rtmsg_flags,
		.fc_type = rtmsg->rtmsg_type,

		.fc_nlinfo.nl_net = net,

		.fc_dst = rtmsg->rtmsg_dst,
		.fc_src = rtmsg->rtmsg_src,
		.fc_gateway = rtmsg->rtmsg_gateway,
	};
}
3736
/* Handle the legacy SIOCADDRT/SIOCDELRT route ioctls.
 *
 * Copies an in6_rtmsg from userspace, converts it to a fib6_config,
 * and adds/deletes the route under the RTNL lock.  Requires
 * CAP_NET_ADMIN in the netns' user namespace.
 */
int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
{
	struct fib6_config cfg;
	struct in6_rtmsg rtmsg;
	int err;

	switch (cmd) {
	case SIOCADDRT:		/* Add a route */
	case SIOCDELRT:		/* Delete a route */
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			return -EPERM;
		/* copy_from_user() returns the number of bytes NOT copied */
		err = copy_from_user(&rtmsg, arg,
				     sizeof(struct in6_rtmsg));
		if (err)
			return -EFAULT;

		rtmsg_to_fib6_config(net, &rtmsg, &cfg);

		rtnl_lock();
		switch (cmd) {
		case SIOCADDRT:
			err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
			break;
		case SIOCDELRT:
			err = ip6_route_del(&cfg, NULL);
			break;
		default:
			err = -EINVAL;
		}
		rtnl_unlock();

		return err;
	}

	return -EINVAL;
}
3773
3774 /*
3775  *      Drop the packet on the floor
3776  */
3777
/* Drop @skb for a blackhole/prohibit/no-route dst: bump the relevant
 * SNMP counter, send an ICMPv6 destination-unreachable with @code and
 * free the skb.  @ipstats_mib_noroutes selects the in/out counter.
 */
static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net *net = dev_net(dst->dev);
	struct inet6_dev *idev;
	int type;

	/* For an l3mdev (VRF) master whose dst points at loopback,
	 * account against the original ingress interface instead.
	 */
	if (netif_is_l3_master(skb->dev) &&
	    dst->dev == net->loopback_dev)
		idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
	else
		idev = ip6_dst_idev(dst);

	switch (ipstats_mib_noroutes) {
	case IPSTATS_MIB_INNOROUTES:
		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
		/* Unspecified destination counts as an address error. */
		if (type == IPV6_ADDR_ANY) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
			break;
		}
		/* FALLTHROUGH */
	case IPSTATS_MIB_OUTNOROUTES:
		IP6_INC_STATS(net, idev, ipstats_mib_noroutes);
		break;
	}

	/* Start over by dropping the dst for l3mdev case */
	if (netif_is_l3_master(skb->dev))
		skb_dst_drop(skb);

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
	kfree_skb(skb);
	return 0;
}
3812
/* dst input handler for blackhole routes: drop as "no route" (input). */
static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
3817
/* dst output handler for blackhole routes: drop as "no route" (output). */
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}
3823
/* dst input handler for prohibit routes: drop as administratively
 * prohibited (input).
 */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}
3828
/* dst output handler for prohibit routes: drop as administratively
 * prohibited (output).
 */
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}
3834
3835 /*
3836  *      Allocate a dst for local (unicast / anycast) address.
3837  */
3838
3839 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3840                                      struct inet6_dev *idev,
3841                                      const struct in6_addr *addr,
3842                                      bool anycast, gfp_t gfp_flags)
3843 {
3844         struct fib6_config cfg = {
3845                 .fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL,
3846                 .fc_ifindex = idev->dev->ifindex,
3847                 .fc_flags = RTF_UP | RTF_ADDRCONF | RTF_NONEXTHOP,
3848                 .fc_dst = *addr,
3849                 .fc_dst_len = 128,
3850                 .fc_protocol = RTPROT_KERNEL,
3851                 .fc_nlinfo.nl_net = net,
3852                 .fc_ignore_dev_down = true,
3853         };
3854
3855         if (anycast) {
3856                 cfg.fc_type = RTN_ANYCAST;
3857                 cfg.fc_flags |= RTF_ANYCAST;
3858         } else {
3859                 cfg.fc_type = RTN_LOCAL;
3860                 cfg.fc_flags |= RTF_LOCAL;
3861         }
3862
3863         return ip6_route_info_create(&cfg, gfp_flags, NULL);
3864 }
3865
/* remove deleted ip from prefsrc entries */
struct arg_dev_net_ip {
	struct net_device *dev;	/* restrict to this device; NULL matches any */
	struct net *net;
	struct in6_addr *addr;	/* the address being removed */
};
3872
3873 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3874 {
3875         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3876         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3877         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3878
3879         if (((void *)rt->fib6_nh.fib_nh_dev == dev || !dev) &&
3880             rt != net->ipv6.fib6_null_entry &&
3881             ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3882                 spin_lock_bh(&rt6_exception_lock);
3883                 /* remove prefsrc entry */
3884                 rt->fib6_prefsrc.plen = 0;
3885                 spin_unlock_bh(&rt6_exception_lock);
3886         }
3887         return 0;
3888 }
3889
3890 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3891 {
3892         struct net *net = dev_net(ifp->idev->dev);
3893         struct arg_dev_net_ip adni = {
3894                 .dev = ifp->idev->dev,
3895                 .net = net,
3896                 .addr = &ifp->addr,
3897         };
3898         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3899 }
3900
/* RA-learned router routes carry both of these flags (see
 * rt6_add_dflt_router()).
 */
#define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT)

/* Remove routers and update dst entries when gateway turn into host. */
static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
{
	struct in6_addr *gateway = (struct in6_addr *)arg;

	/* Returning -1 asks the fib walker to delete this entry. */
	if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
	    rt->fib6_nh.fib_nh_gw_family &&
	    ipv6_addr_equal(gateway, &rt->fib6_nh.fib_nh_gw6)) {
		return -1;
	}

	/* Further clean up cached routes in exception table.
	 * This is needed because cached route may have a different
	 * gateway than its 'parent' in the case of an ip redirect.
	 */
	rt6_exceptions_clean_tohost(rt, gateway);

	return 0;
}
3922
/* Called when @gateway stops acting as a router: delete RA routes via
 * it and purge matching cached routes (see fib6_clean_tohost()).
 */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}
3927
/* Argument bundle for the fib6_ifup()/fib6_ifdown() walkers; which
 * union member is meaningful depends on the callback (nh_flags for
 * up, event for down).
 */
struct arg_netdev_event {
	const struct net_device *dev;
	union {
		unsigned char nh_flags;
		unsigned long event;
	};
};
3935
/* Return the first route of @rt's multipath group: the first entry in
 * the fib6 node's leaf list with the same metric that qualifies for
 * ECMP.  Caller must hold the table write lock (lockdep-checked
 * dereferences below).
 */
static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
{
	struct fib6_info *iter;
	struct fib6_node *fn;

	fn = rcu_dereference_protected(rt->fib6_node,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
	iter = rcu_dereference_protected(fn->leaf,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
	while (iter) {
		if (iter->fib6_metric == rt->fib6_metric &&
		    rt6_qualify_for_ecmp(iter))
			return iter;
		iter = rcu_dereference_protected(iter->fib6_next,
				lockdep_is_held(&rt->fib6_table->tb6_lock));
	}

	return NULL;
}
3955
3956 static bool rt6_is_dead(const struct fib6_info *rt)
3957 {
3958         if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD ||
3959             (rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN &&
3960              ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev)))
3961                 return true;
3962
3963         return false;
3964 }
3965
3966 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3967 {
3968         struct fib6_info *iter;
3969         int total = 0;
3970
3971         if (!rt6_is_dead(rt))
3972                 total += rt->fib6_nh.fib_nh_weight;
3973
3974         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3975                 if (!rt6_is_dead(iter))
3976                         total += iter->fib6_nh.fib_nh_weight;
3977         }
3978
3979         return total;
3980 }
3981
3982 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3983 {
3984         int upper_bound = -1;
3985
3986         if (!rt6_is_dead(rt)) {
3987                 *weight += rt->fib6_nh.fib_nh_weight;
3988                 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3989                                                     total) - 1;
3990         }
3991         atomic_set(&rt->fib6_nh.fib_nh_upper_bound, upper_bound);
3992 }
3993
3994 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3995 {
3996         struct fib6_info *iter;
3997         int weight = 0;
3998
3999         rt6_upper_bound_set(rt, &weight, total);
4000
4001         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4002                 rt6_upper_bound_set(iter, &weight, total);
4003 }
4004
4005 void rt6_multipath_rebalance(struct fib6_info *rt)
4006 {
4007         struct fib6_info *first;
4008         int total;
4009
4010         /* In case the entire multipath route was marked for flushing,
4011          * then there is no need to rebalance upon the removal of every
4012          * sibling route.
4013          */
4014         if (!rt->fib6_nsiblings || rt->should_flush)
4015                 return;
4016
4017         /* During lookup routes are evaluated in order, so we need to
4018          * make sure upper bounds are assigned from the first sibling
4019          * onwards.
4020          */
4021         first = rt6_multipath_first_sibling(rt);
4022         if (WARN_ON_ONCE(!first))
4023                 return;
4024
4025         total = rt6_multipath_total_weight(first);
4026         rt6_multipath_upper_bound_set(first, total);
4027 }
4028
4029 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
4030 {
4031         const struct arg_netdev_event *arg = p_arg;
4032         struct net *net = dev_net(arg->dev);
4033
4034         if (rt != net->ipv6.fib6_null_entry &&
4035             rt->fib6_nh.fib_nh_dev == arg->dev) {
4036                 rt->fib6_nh.fib_nh_flags &= ~arg->nh_flags;
4037                 fib6_update_sernum_upto_root(net, rt);
4038                 rt6_multipath_rebalance(rt);
4039         }
4040
4041         return 0;
4042 }
4043
/* Clear @nh_flags (e.g. RTNH_F_DEAD) on every route using @dev,
 * typically when the device comes back up.
 */
void rt6_sync_up(struct net_device *dev, unsigned char nh_flags)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.nh_flags = nh_flags,
		},
	};

	/* When reviving dead nexthops on a device whose carrier is up,
	 * also clear the link-down marker.
	 */
	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
		arg.nh_flags |= RTNH_F_LINKDOWN;

	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
}
4058
4059 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
4060                                    const struct net_device *dev)
4061 {
4062         struct fib6_info *iter;
4063
4064         if (rt->fib6_nh.fib_nh_dev == dev)
4065                 return true;
4066         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4067                 if (iter->fib6_nh.fib_nh_dev == dev)
4068                         return true;
4069
4070         return false;
4071 }
4072
4073 static void rt6_multipath_flush(struct fib6_info *rt)
4074 {
4075         struct fib6_info *iter;
4076
4077         rt->should_flush = 1;
4078         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4079                 iter->should_flush = 1;
4080 }
4081
4082 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
4083                                              const struct net_device *down_dev)
4084 {
4085         struct fib6_info *iter;
4086         unsigned int dead = 0;
4087
4088         if (rt->fib6_nh.fib_nh_dev == down_dev ||
4089             rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
4090                 dead++;
4091         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4092                 if (iter->fib6_nh.fib_nh_dev == down_dev ||
4093                     iter->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
4094                         dead++;
4095
4096         return dead;
4097 }
4098
4099 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
4100                                        const struct net_device *dev,
4101                                        unsigned char nh_flags)
4102 {
4103         struct fib6_info *iter;
4104
4105         if (rt->fib6_nh.fib_nh_dev == dev)
4106                 rt->fib6_nh.fib_nh_flags |= nh_flags;
4107         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4108                 if (iter->fib6_nh.fib_nh_dev == dev)
4109                         iter->fib6_nh.fib_nh_flags |= nh_flags;
4110 }
4111
/* called with write lock held for table with rt */
/* fib6_clean_all() callback for device down/unregister events.
 * A negative return value asks the fib walker to delete the entry;
 * NOTE(review): -2 appears to request deletion with special multipath
 * sibling handling — confirm against fib6_clean_node().
 */
static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
{
	const struct arg_netdev_event *arg = p_arg;
	const struct net_device *dev = arg->dev;
	struct net *net = dev_net(dev);

	/* The per-netns null entry is never removed. */
	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	switch (arg->event) {
	case NETDEV_UNREGISTER:
		/* Device is going away entirely: delete its routes. */
		return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
	case NETDEV_DOWN:
		if (rt->should_flush)
			return -1;
		if (!rt->fib6_nsiblings)
			return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
		if (rt6_multipath_uses_dev(rt, dev)) {
			unsigned int count;

			/* Flush the whole multipath route once every
			 * nexthop is dead; otherwise mark the affected
			 * nexthops and rebalance the remaining weights.
			 */
			count = rt6_multipath_dead_count(rt, dev);
			if (rt->fib6_nsiblings + 1 == count) {
				rt6_multipath_flush(rt);
				return -1;
			}
			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
						   RTNH_F_LINKDOWN);
			fib6_update_sernum(net, rt);
			rt6_multipath_rebalance(rt);
		}
		return -2;
	case NETDEV_CHANGE:
		/* Carrier change: mark link-down, never delete. */
		if (rt->fib6_nh.fib_nh_dev != dev ||
		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
			break;
		rt->fib6_nh.fib_nh_flags |= RTNH_F_LINKDOWN;
		rt6_multipath_rebalance(rt);
		break;
	}

	return 0;
}
4155
/* Walk all FIB tables in @dev's netns and apply fib6_ifdown() for
 * @event.  With the skip_notify_on_dev_down sysctl set, deletions
 * performed by the walk suppress netlink notifications.
 */
void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.event = event,
		},
	};
	struct net *net = dev_net(dev);

	if (net->ipv6.sysctl.skip_notify_on_dev_down)
		fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
	else
		fib6_clean_all(net, fib6_ifdown, &arg);
}
4171
/* Take @dev out of IPv6 service for @event: sync down its routes,
 * flush uncached dsts referencing it and purge its ND neighbour
 * entries.
 */
void rt6_disable_ip(struct net_device *dev, unsigned long event)
{
	rt6_sync_down_dev(dev, event);
	rt6_uncached_list_flush_dev(dev_net(dev), dev);
	neigh_ifdown(&nd_tbl, dev);
}
4178
/* Argument bundle for rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new MTU */
};
4183
/* fib6_clean_all() callback: propagate an administrative MTU change on
 * arg->dev into the route's RTAX_MTU metric (unless locked) and into
 * its cached exception routes.  Always returns 0.
 */
static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e. jumbo frame)
	 */
	if (rt->fib6_nh.fib_nh_dev == arg->dev &&
	    !fib6_metric_locked(rt, RTAX_MTU)) {
		u32 mtu = rt->fib6_pmtu;

		if (mtu >= arg->mtu ||
		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
			fib6_metric_set(rt, RTAX_MTU, arg->mtu);

		/* Cached (exception) routes track PMTU separately. */
		spin_lock_bh(&rt6_exception_lock);
		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}
4218
/* Propagate an administrative MTU change on @dev to all FIB entries in
 * its netns (see rt6_mtu_change_route()).
 */
void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
{
	struct rt6_mtu_change_arg arg = {
		.dev = dev,
		.mtu = mtu,
	};

	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
}
4228
/* Netlink attribute validation policy for IPv6 RTM_* route requests. */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]		= { .len = sizeof(struct in6_addr) },
	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
	[RTA_OIF]		= { .type = NLA_U32 },
	[RTA_IIF]		= { .type = NLA_U32 },
	[RTA_PRIORITY]		= { .type = NLA_U32 },
	[RTA_METRICS]		= { .type = NLA_NESTED },
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]		= { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
	[RTA_ENCAP]		= { .type = NLA_NESTED },
	[RTA_EXPIRES]		= { .type = NLA_U32 },
	[RTA_UID]		= { .type = NLA_U32 },
	[RTA_MARK]		= { .type = NLA_U32 },
	[RTA_TABLE]		= { .type = NLA_U32 },
	[RTA_IP_PROTO]		= { .type = NLA_U8 },
	[RTA_SPORT]		= { .type = NLA_U16 },
	[RTA_DPORT]		= { .type = NLA_U16 },
};
4248
/* Translate an RTM_NEWROUTE/RTM_DELROUTE netlink request into a
 * struct fib6_config.
 *
 * @skb:    request skb (source of the sender's netlink portid)
 * @nlh:    the netlink message header to parse
 * @cfg:    output; fully overwritten on entry
 * @extack: extended ack for error reporting
 *
 * Returns 0 on success or a negative errno (-EINVAL for malformed
 * attributes, or the error from the attribute/encap validators).
 *
 * Note: cfg->fc_mx/fc_mp point INTO the request message; they are only
 * valid while @nlh is.
 */
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg,
			      struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
				     rtm_ipv6_policy, extack);
	if (err < 0)
		goto errout;

	/* Default error for the length checks below. */
	err = -EINVAL;
	rtm = nlmsg_data(nlh);

	*cfg = (struct fib6_config){
		.fc_table = rtm->rtm_table,
		.fc_dst_len = rtm->rtm_dst_len,
		.fc_src_len = rtm->rtm_src_len,
		.fc_flags = RTF_UP,
		.fc_protocol = rtm->rtm_protocol,
		.fc_type = rtm->rtm_type,

		.fc_nlinfo.portid = NETLINK_CB(skb).portid,
		.fc_nlinfo.nlh = nlh,
		.fc_nlinfo.nl_net = sock_net(skb->sk),
	};

	/* Special route types are installed as reject routes. */
	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}
	if (tb[RTA_VIA]) {
		NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
		goto errout;
	}

	if (tb[RTA_DST]) {
		/* Attribute may be truncated to the prefix length. */
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		/* Points into the request message, not a copy. */
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	/* RTA_TABLE overrides the rtm_table header field. */
	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
						     cfg->fc_mp_len, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_PREF]) {
		/* Unknown preference values fall back to medium. */
		pref = nla_get_u8(tb[RTA_PREF]);
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

	if (tb[RTA_ENCAP_TYPE]) {
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_EXPIRES]) {
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

		/* Infinite timeouts mean no expiry at all. */
		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
		}
	}

	err = 0;
errout:
	return err;
}
4379
/* One pending nexthop while building a multipath route: the fib6_info
 * created for it plus the per-nexthop config (kept so the route can be
 * deleted again on partial failure).  Linked on a local list via @next.
 */
struct rt6_nh {
	struct fib6_info *fib6_info;
	struct fib6_config r_cfg;
	struct list_head next;
};
4385
4386 static int ip6_route_info_append(struct net *net,
4387                                  struct list_head *rt6_nh_list,
4388                                  struct fib6_info *rt,
4389                                  struct fib6_config *r_cfg)
4390 {
4391         struct rt6_nh *nh;
4392         int err = -EEXIST;
4393
4394         list_for_each_entry(nh, rt6_nh_list, next) {
4395                 /* check if fib6_info already exists */
4396                 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4397                         return err;
4398         }
4399
4400         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4401         if (!nh)
4402                 return -ENOMEM;
4403         nh->fib6_info = rt;
4404         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4405         list_add_tail(&nh->next, rt6_nh_list);
4406
4407         return 0;
4408 }
4409
4410 static void ip6_route_mpath_notify(struct fib6_info *rt,
4411                                    struct fib6_info *rt_last,
4412                                    struct nl_info *info,
4413                                    __u16 nlflags)
4414 {
4415         /* if this is an APPEND route, then rt points to the first route
4416          * inserted and rt_last points to last route inserted. Userspace
4417          * wants a consistent dump of the route which starts at the first
4418          * nexthop. Since sibling routes are always added at the end of
4419          * the list, find the first sibling of the last route appended
4420          */
4421         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4422                 rt = list_first_entry(&rt_last->fib6_siblings,
4423                                       struct fib6_info,
4424                                       fib6_siblings);
4425         }
4426
4427         if (rt)
4428                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4429 }
4430
/* Install a multipath route: build one fib6_info per RTA_MULTIPATH
 * nexthop, insert them all, and send a single notification covering the
 * whole route.  On a mid-insert failure, routes already added are
 * deleted again so userspace sees a coherent add/delete sequence.
 * Returns 0 on success or a negative errno.
 */
static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct fib6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	__u16 nlflags;
	int remaining;
	int attrlen;
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);

	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * fib6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		/* Start from the route-level config, then apply
		 * per-nexthop overrides (ifindex, gateway, encap).
		 */
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}
		if (!rt6_qualify_for_ecmp(rt)) {
			err = -EINVAL;
			NL_SET_ERR_MSG(extack,
				       "Device only routes can not be added for IPv6 using the multipath API.");
			fib6_info_release(rt);
			goto cleanup;
		}

		/* rtnh_hops is weight - 1 on the wire. */
		rt->fib6_nh.fib_nh_weight = rtnh->rtnh_hops + 1;

		/* On success the list owns the reference to rt. */
		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
					    rt, &r_cfg);
		if (err) {
			fib6_info_release(rt);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* for add and replace send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		err = __ip6_ins_rt(nh->fib6_info, info, extack);
		/* Drop the list's reference; on success the pointer is
		 * still used below for the notification — presumably the
		 * insert path keeps its own reference (NOTE(review):
		 * confirm against __ip6_ins_rt).
		 */
		fib6_info_release(nh->fib6_info);

		if (!err) {
			/* save reference to last route successfully inserted */
			rt_last = nh->fib6_info;

			/* save reference to first route for notification */
			if (!rt_notif)
				rt_notif = nh->fib6_info;
		}

		/* nh->fib6_info is used or freed at this point, reset to NULL*/
		nh->fib6_info = NULL;
		if (err) {
			if (replace && nhn)
				NL_SET_ERR_MSG_MOD(extack,
						   "multipath route replace failed (check consistency of installed routes)");
			err_nh = nh;
			goto add_errout;
		}

		/* Because each route is added like a single route we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		nhn++;
	}

	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	goto cleanup;

add_errout:
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg, extack);
	}

cleanup:
	/* Free the work list; entries not yet inserted still hold a
	 * fib6_info reference that must be dropped here.
	 */
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->fib6_info)
			fib6_info_release(nh->fib6_info);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}
4578
4579 static int ip6_route_multipath_del(struct fib6_config *cfg,
4580                                    struct netlink_ext_ack *extack)
4581 {
4582         struct fib6_config r_cfg;
4583         struct rtnexthop *rtnh;
4584         int remaining;
4585         int attrlen;
4586         int err = 1, last_err = 0;
4587
4588         remaining = cfg->fc_mp_len;
4589         rtnh = (struct rtnexthop *)cfg->fc_mp;
4590
4591         /* Parse a Multipath Entry */
4592         while (rtnh_ok(rtnh, remaining)) {
4593                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4594                 if (rtnh->rtnh_ifindex)
4595                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4596
4597                 attrlen = rtnh_attrlen(rtnh);
4598                 if (attrlen > 0) {
4599                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4600
4601                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4602                         if (nla) {
4603                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4604                                 r_cfg.fc_flags |= RTF_GATEWAY;
4605                         }
4606                 }
4607                 err = ip6_route_del(&r_cfg, extack);
4608                 if (err)
4609                         last_err = err;
4610
4611                 rtnh = rtnh_next(rtnh, &remaining);
4612         }
4613
4614         return last_err;
4615 }
4616
4617 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4618                               struct netlink_ext_ack *extack)
4619 {
4620         struct fib6_config cfg;
4621         int err;
4622
4623         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4624         if (err < 0)
4625                 return err;
4626
4627         if (cfg.fc_mp)
4628                 return ip6_route_multipath_del(&cfg, extack);
4629         else {
4630                 cfg.fc_delete_all_nh = 1;
4631                 return ip6_route_del(&cfg, extack);
4632         }
4633 }
4634
4635 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4636                               struct netlink_ext_ack *extack)
4637 {
4638         struct fib6_config cfg;
4639         int err;
4640
4641         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4642         if (err < 0)
4643                 return err;
4644
4645         if (cfg.fc_metric == 0)
4646                 cfg.fc_metric = IP6_RT_PRIO_USER;
4647
4648         if (cfg.fc_mp)
4649                 return ip6_route_multipath_add(&cfg, extack);
4650         else
4651                 return ip6_route_add(&cfg, GFP_KERNEL, extack);
4652 }
4653
4654 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4655 {
4656         int nexthop_len = 0;
4657
4658         if (rt->fib6_nsiblings) {
4659                 nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
4660                             + NLA_ALIGN(sizeof(struct rtnexthop))
4661                             + nla_total_size(16) /* RTA_GATEWAY */
4662                             + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws);
4663
4664                 nexthop_len *= rt->fib6_nsiblings;
4665         }
4666
4667         return NLMSG_ALIGN(sizeof(struct rtmsg))
4668                + nla_total_size(16) /* RTA_SRC */
4669                + nla_total_size(16) /* RTA_DST */
4670                + nla_total_size(16) /* RTA_GATEWAY */
4671                + nla_total_size(16) /* RTA_PREFSRC */
4672                + nla_total_size(4) /* RTA_TABLE */
4673                + nla_total_size(4) /* RTA_IIF */
4674                + nla_total_size(4) /* RTA_OIF */
4675                + nla_total_size(4) /* RTA_PRIORITY */
4676                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4677                + nla_total_size(sizeof(struct rta_cacheinfo))
4678                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4679                + nla_total_size(1) /* RTA_PREF */
4680                + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws)
4681                + nexthop_len;
4682 }
4683
/* Fill one RTM_NEWROUTE netlink message for @rt into @skb.
 *
 * @dst may be NULL (dumping a FIB entry) or a cached rt6_info whose
 * dst/src/flags take precedence over the fib6_info's.  @dest/@src, when
 * non-NULL, are the resolved full addresses from a route lookup.  @iif
 * non-zero marks an input-path lookup.
 *
 * Returns 0 on success or -EMSGSIZE if @skb ran out of room (the
 * partial message is cancelled).
 */
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	struct rt6_info *rt6 = (struct rt6_info *)dst;
	struct rt6key *rt6_dst, *rt6_src;
	u32 *pmetrics, table, rt6_flags;
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;
	long expires = 0;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	/* Prefer the cached route's keys/flags when one was supplied. */
	if (rt6) {
		rt6_dst = &rt6->rt6i_dst;
		rt6_src = &rt6->rt6i_src;
		rt6_flags = rt6->rt6i_flags;
	} else {
		rt6_dst = &rt->fib6_dst;
		rt6_src = &rt->fib6_src;
		rt6_flags = rt->fib6_flags;
	}

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt6_dst->plen;
	rtm->rtm_src_len = rt6_src->plen;
	rtm->rtm_tos = 0;
	if (rt->fib6_table)
		table = rt->fib6_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	/* rtm_table is a u8; larger table ids go in RTA_TABLE only. */
	rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;

	rtm->rtm_type = rt->fib6_type;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->fib6_protocol;

	if (rt6_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	/* A resolved destination is always reported as a /128. */
	if (dest) {
		if (nla_put_in6_addr(skb, RTA_DST, dest))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		/* Multicast destinations are resolved via the multicast
		 * routing table instead of RTA_IIF.
		 */
		if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dest) {
		/* Output path: report the source address that would be
		 * selected for this destination.
		 */
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->fib6_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->fib6_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt6) {
		if (rt6_flags & RTF_GATEWAY &&
		    nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
			goto nla_put_failure;

		if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
			goto nla_put_failure;
	} else if (rt->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (fib_add_nexthop(skb, &rt->fib6_nh.nh_common,
				    rt->fib6_nh.fib_nh_weight) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings, fib6_siblings) {
			if (fib_add_nexthop(skb, &sibling->fib6_nh.nh_common,
					    sibling->fib6_nh.fib_nh_weight) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		unsigned char nh_flags = 0;

		if (fib_nexthop_info(skb, &rt->fib6_nh.nh_common,
				     &nh_flags, false) < 0)
			goto nla_put_failure;

		rtm->rtm_flags |= nh_flags;
	}

	if (rt6_flags & RTF_EXPIRES) {
		expires = dst ? dst->expires : rt->expires;
		expires -= jiffies;
	}

	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
		goto nla_put_failure;


	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	/* Roll back the partially-built message. */
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
4841
4842 static bool fib6_info_uses_dev(const struct fib6_info *f6i,
4843                                const struct net_device *dev)
4844 {
4845         if (f6i->fib6_nh.fib_nh_dev == dev)
4846                 return true;
4847
4848         if (f6i->fib6_nsiblings) {
4849                 struct fib6_info *sibling, *next_sibling;
4850
4851                 list_for_each_entry_safe(sibling, next_sibling,
4852                                          &f6i->fib6_siblings, fib6_siblings) {
4853                         if (sibling->fib6_nh.fib_nh_dev == dev)
4854                                 return true;
4855                 }
4856         }
4857
4858         return false;
4859 }
4860
/* Per-route callback for an RTM_GETROUTE dump walk.
 *
 * Returns 0 for the null entry, 1 when the route is filtered out
 * (treated as handled), or the result of rt6_fill_node() — which may be
 * -EMSGSIZE when the dump skb is full (NOTE(review): caller presumably
 * resumes the dump then; confirm against the fib6 walker).
 */
int rt6_dump_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
	struct fib_dump_filter *filter = &arg->filter;
	unsigned int flags = NLM_F_MULTI;
	struct net *net = arg->net;

	/* Never report the null (no-route) entry. */
	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	if ((filter->flags & RTM_F_PREFIX) &&
	    !(rt->fib6_flags & RTF_PREFIX_RT)) {
		/* success since this is not a prefix route */
		return 1;
	}
	if (filter->filter_set) {
		/* Skip routes not matching the requested type, device or
		 * protocol; matched dumps are flagged as filtered.
		 */
		if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
		    (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
		    (filter->protocol && rt->fib6_protocol != filter->protocol)) {
			return 1;
		}
		flags |= NLM_F_DUMP_FILTERED;
	}

	return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
			     RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
			     arg->cb->nlh->nlmsg_seq, flags);
}
4889
/* Validate an RTM_GETROUTE request and parse its attributes into @tb.
 *
 * Non-strict sockets get plain policy parsing; strict sockets must have
 * zeroed unused header fields, full-length (/128) prefix lengths when
 * RTA_SRC/RTA_DST are present, no flags beyond RTM_F_FIB_MATCH, and
 * only the whitelisted attributes below.
 *
 * Returns 0 on success or a negative errno with @extack set.
 */
static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
					const struct nlmsghdr *nlh,
					struct nlattr **tb,
					struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	int i, err;

	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
		NL_SET_ERR_MSG_MOD(extack,
				   "Invalid header for get route request");
		return -EINVAL;
	}

	/* Legacy (non-strict) sockets: policy parsing only. */
	if (!netlink_strict_get_check(skb))
		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
					      rtm_ipv6_policy, extack);

	rtm = nlmsg_data(nlh);
	if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
	    rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
	    rtm->rtm_type) {
		NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
		return -EINVAL;
	}
	if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
		NL_SET_ERR_MSG_MOD(extack,
				   "Invalid flags for get route request");
		return -EINVAL;
	}

	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
					    rtm_ipv6_policy, extack);
	if (err)
		return err;

	/* Address attributes require the matching prefix length. */
	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
		NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
		return -EINVAL;
	}

	/* Reject any attribute outside the GETROUTE whitelist. */
	for (i = 0; i <= RTA_MAX; i++) {
		if (!tb[i])
			continue;

		switch (i) {
		case RTA_SRC:
		case RTA_DST:
		case RTA_IIF:
		case RTA_OIF:
		case RTA_MARK:
		case RTA_UID:
		case RTA_SPORT:
		case RTA_DPORT:
		case RTA_IP_PROTO:
			break;
		default:
			NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
			return -EINVAL;
		}
	}

	return 0;
}
4956
4957 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4958                               struct netlink_ext_ack *extack)
4959 {
4960         struct net *net = sock_net(in_skb->sk);
4961         struct nlattr *tb[RTA_MAX+1];
4962         int err, iif = 0, oif = 0;
4963         struct fib6_info *from;
4964         struct dst_entry *dst;
4965         struct rt6_info *rt;
4966         struct sk_buff *skb;
4967         struct rtmsg *rtm;
4968         struct flowi6 fl6 = {};
4969         bool fibmatch;
4970
4971         err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
4972         if (err < 0)
4973                 goto errout;
4974
4975         err = -EINVAL;
4976         rtm = nlmsg_data(nlh);
4977         fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4978         fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4979
4980         if (tb[RTA_SRC]) {
4981                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4982                         goto errout;
4983
4984                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4985         }
4986
4987         if (tb[RTA_DST]) {
4988                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4989                         goto errout;
4990
4991                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4992         }
4993
4994         if (tb[RTA_IIF])
4995                 iif = nla_get_u32(tb[RTA_IIF]);
4996
4997         if (tb[RTA_OIF])
4998                 oif = nla_get_u32(tb[RTA_OIF]);
4999
5000         if (tb[RTA_MARK])
5001                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
5002
5003         if (tb[RTA_UID])
5004                 fl6.flowi6_uid = make_kuid(current_user_ns(),
5005                                            nla_get_u32(tb[RTA_UID]));
5006         else
5007                 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
5008
5009         if (tb[RTA_SPORT])
5010                 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
5011
5012         if (tb[RTA_DPORT])
5013                 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
5014
5015         if (tb[RTA_IP_PROTO]) {
5016                 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
5017                                                   &fl6.flowi6_proto, AF_INET6,
5018                                                   extack);
5019                 if (err)
5020                         goto errout;
5021         }
5022
5023         if (iif) {
5024                 struct net_device *dev;
5025                 int flags = 0;
5026
5027                 rcu_read_lock();
5028
5029                 dev = dev_get_by_index_rcu(net, iif);
5030                 if (!dev) {
5031                         rcu_read_unlock();
5032                         err = -ENODEV;
5033                         goto errout;
5034                 }
5035
5036                 fl6.flowi6_iif = iif;
5037
5038                 if (!ipv6_addr_any(&fl6.saddr))
5039                         flags |= RT6_LOOKUP_F_HAS_SADDR;
5040
5041                 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
5042
5043                 rcu_read_unlock();
5044         } else {
5045                 fl6.flowi6_oif = oif;
5046
5047                 dst = ip6_route_output(net, NULL, &fl6);
5048         }
5049
5050
5051         rt = container_of(dst, struct rt6_info, dst);
5052         if (rt->dst.error) {
5053                 err = rt->dst.error;
5054                 ip6_rt_put(rt);
5055                 goto errout;
5056         }
5057
5058         if (rt == net->ipv6.ip6_null_entry) {
5059                 err = rt->dst.error;
5060                 ip6_rt_put(rt);
5061                 goto errout;
5062         }
5063
5064         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
5065         if (!skb) {
5066                 ip6_rt_put(rt);
5067                 err = -ENOBUFS;
5068                 goto errout;
5069         }
5070
5071         skb_dst_set(skb, &rt->dst);
5072
5073         rcu_read_lock();
5074         from = rcu_dereference(rt->from);
5075         if (from) {
5076                 if (fibmatch)
5077                         err = rt6_fill_node(net, skb, from, NULL, NULL, NULL,
5078                                             iif, RTM_NEWROUTE,
5079                                             NETLINK_CB(in_skb).portid,
5080                                             nlh->nlmsg_seq, 0);
5081                 else
5082                         err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
5083                                             &fl6.saddr, iif, RTM_NEWROUTE,
5084                                             NETLINK_CB(in_skb).portid,
5085                                             nlh->nlmsg_seq, 0);
5086         } else {
5087                 err = -ENETUNREACH;
5088         }
5089         rcu_read_unlock();
5090
5091         if (err < 0) {
5092                 kfree_skb(skb);
5093                 goto errout;
5094         }
5095
5096         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
5097 errout:
5098         return err;
5099 }
5100
5101 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
5102                      unsigned int nlm_flags)
5103 {
5104         struct sk_buff *skb;
5105         struct net *net = info->nl_net;
5106         u32 seq;
5107         int err;
5108
5109         err = -ENOBUFS;
5110         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
5111
5112         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
5113         if (!skb)
5114                 goto errout;
5115
5116         err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
5117                             event, info->portid, seq, nlm_flags);
5118         if (err < 0) {
5119                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
5120                 WARN_ON(err == -EMSGSIZE);
5121                 kfree_skb(skb);
5122                 goto errout;
5123         }
5124         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
5125                     info->nlh, gfp_any());
5126         return;
5127 errout:
5128         if (err < 0)
5129                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
5130 }
5131
/* Netdevice notifier: keep the per-netns "special" route entries
 * (null, and with CONFIG_IPV6_MULTIPLE_TABLES also prohibit/blackhole)
 * bound to their namespace's loopback device.  Only loopback
 * registration/unregistration is of interest here.
 */
static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		/* Point the special entries at the new loopback device and
		 * take an inet6_dev reference for each rt6i_idev user.
		 */
		net->ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = dev;
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	 } else if (event == NETDEV_UNREGISTER &&
		    dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER could be fired for multiple times by
		 * netdev_wait_allrefs(). Make sure we only call this once
		 * (in6_dev_put_clear() drops the reference and NULLs the
		 * pointer, so a repeat is a no-op).
		 */
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}
5165
5166 /*
5167  *      /proc
5168  */
5169
#ifdef CONFIG_PROC_FS
/* /proc/net/rt6_stats: emit the per-netns FIB statistics as seven
 * hex fields in this fixed order: fib_nodes, fib_route_nodes,
 * fib_rt_alloc, fib_rt_entries, fib_rt_cache, dst entries in use,
 * fib_discarded_routes.
 */
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;
	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}
#endif	/* CONFIG_PROC_FS */
5186
5187 #ifdef CONFIG_SYSCTL
5188
5189 static
5190 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
5191                               void __user *buffer, size_t *lenp, loff_t *ppos)
5192 {
5193         struct net *net;
5194         int delay;
5195         int ret;
5196         if (!write)
5197                 return -EINVAL;
5198
5199         net = (struct net *)ctl->extra1;
5200         delay = net->ipv6.sysctl.flush_delay;
5201         ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
5202         if (ret)
5203                 return ret;
5204
5205         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
5206         return 0;
5207 }
5208
/* Range bounds for the skip_notify_on_dev_down sysctl below. */
static int zero;
static int one = 1;

/* Template for the per-netns /proc/sys/net/ipv6/route/ table.  The
 * .data pointers reference init_net here; ipv6_route_sysctl_init()
 * duplicates the table and rewrites them (by index) for each new
 * namespace.
 */
static struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		/* Same variable as gc_min_interval, exposed in ms. */
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{
		.procname	=	"skip_notify_on_dev_down",
		.data		=	&init_net.ipv6.sysctl.skip_notify_on_dev_down,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
		.extra1		=	&zero,
		.extra2		=	&one,
	},
	{ }
};
5294
/* Duplicate ipv6_route_table_template for a new namespace and point
 * each entry's .data at the namespace-local variable.  The table[i]
 * indices must stay in sync with the template order above.  Returns
 * the new table, or NULL on allocation failure (caller handles it).
 */
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;

		/* Don't export sysctls to unprivileged users: clearing the
		 * first entry's procname acts as a table terminator at
		 * registration time, hiding the whole table.
		 */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
5324 #endif
5325
/* Per-netns init of the IPv6 routing subsystem: copy the dst_ops
 * template, allocate the special null (and, with multiple tables,
 * prohibit/blackhole) route entries from their templates, and seed
 * the GC/PMTU sysctl defaults.  Returns 0 or -ENOMEM; partial
 * allocations are unwound via the goto chain at the bottom.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	/* Defaults for the route sysctls; per-netns copies of the
	 * ipv6_route_table_template entries override these at runtime.
	 */
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
	net->ipv6.sysctl.skip_notify_on_dev_down = 0;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

	/* Error unwind: free in reverse order of allocation. */
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
5401
/* Per-netns teardown: free the special route entries allocated in
 * ip6_route_net_init() and destroy the dst entry counter.
 */
static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
5412
5413 static int __net_init ip6_route_net_init_late(struct net *net)
5414 {
5415 #ifdef CONFIG_PROC_FS
5416         proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
5417                         sizeof(struct ipv6_route_iter));
5418         proc_create_net_single("rt6_stats", 0444, net->proc_net,
5419                         rt6_stats_seq_show, NULL);
5420 #endif
5421         return 0;
5422 }
5423
/* Late per-netns teardown: remove the proc entries created by
 * ip6_route_net_init_late().
 */
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
5431
/* Core per-netns route setup/teardown (special entries, dst_ops). */
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
5436
5437 static int __net_init ipv6_inetpeer_init(struct net *net)
5438 {
5439         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5440
5441         if (!bp)
5442                 return -ENOMEM;
5443         inet_peer_base_init(bp);
5444         net->ipv6.peers = bp;
5445         return 0;
5446 }
5447
5448 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5449 {
5450         struct inet_peer_base *bp = net->ipv6.peers;
5451
5452         net->ipv6.peers = NULL;
5453         inetpeer_invalidate_tree(bp);
5454         kfree(bp);
5455 }
5456
/* Per-netns inet_peer base lifecycle for IPv6. */
static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};
5461
/* Late per-netns setup/teardown (procfs entries). */
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};
5466
/* Netdevice notifier for loopback (un)registration; priority is set
 * below addrconf's so addrconf processes the event first — TODO
 * confirm against ADDRCONF_NOTIFY_PRIORITY users.
 */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
5471
/* Bind init_net's special route entries to its loopback device.
 * The loopback device is registered before this portion of code runs,
 * so ip6_route_dev_notify() never saw its NETDEV_REGISTER event and
 * the loopback reference in rt6_info was not taken; do it manually
 * for init_net.
 */
void __init ip6_route_init_special_entries(void)
{
	init_net.ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #endif
}
5487
/* Boot-time init of the IPv6 routing subsystem: create the dst cache,
 * register the pernet subsystems, FIB, xfrm and policy-rule layers,
 * the RTM_{NEW,DEL,GET}ROUTE netlink handlers, the netdevice notifier,
 * and the per-cpu uncached-route lists.  On any failure everything
 * registered so far is unwound in reverse order via the goto chain.
 */
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	/* Blackhole dsts share the rt6_info slab cache. */
	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	/* All three rtnl handlers unwind through the same label:
	 * rtnl_unregister_all() removes whichever ones registered.
	 */
	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

	/* Error unwind, reverse order of registration. */
out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
5579
/* Module teardown: undo ip6_route_init() in reverse registration
 * order (notifier, late pernet ops, rules, xfrm, FIB GC, pernet
 * subsystems, dst counters, slab cache).
 */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}