/* net/ipv6/route.c — IPv6 routing / FIB front-end (from linux.git) */
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <trace/events/fib6.h>
67
68 #include <linux/uaccess.h>
69
70 #ifdef CONFIG_SYSCTL
71 #include <linux/sysctl.h>
72 #endif
73
/* Nexthop-neighbour classification used by route scoring
 * (rt6_check_neigh()/rt6_score_route()); negative values are failures
 * of increasing severity.
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,		/* route unusable, skip it */
	RT6_NUD_FAIL_PROBE = -2,	/* neighbour in NUD_FAILED */
	RT6_NUD_FAIL_DO_RR = -1,	/* no neighbour entry; round-robin */
	RT6_NUD_SUCCEED = 1		/* neighbour (probably) reachable */
};
80
81 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
82 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
83 static unsigned int      ip6_mtu(const struct dst_entry *dst);
84 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
85 static void             ip6_dst_destroy(struct dst_entry *);
86 static void             ip6_dst_ifdown(struct dst_entry *,
87                                        struct net_device *dev, int how);
88 static int               ip6_dst_gc(struct dst_ops *ops);
89
90 static int              ip6_pkt_discard(struct sk_buff *skb);
91 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
92 static int              ip6_pkt_prohibit(struct sk_buff *skb);
93 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
94 static void             ip6_link_failure(struct sk_buff *skb);
95 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
96                                            struct sk_buff *skb, u32 mtu);
97 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
98                                         struct sk_buff *skb);
99 static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
100 static size_t rt6_nlmsg_size(struct fib6_info *rt);
101 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
102                          struct fib6_info *rt, struct dst_entry *dst,
103                          struct in6_addr *dest, struct in6_addr *src,
104                          int iif, int type, u32 portid, u32 seq,
105                          unsigned int flags);
106 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
107                                            struct in6_addr *daddr,
108                                            struct in6_addr *saddr);
109
110 #ifdef CONFIG_IPV6_ROUTE_INFO
111 static struct fib6_info *rt6_add_route_info(struct net *net,
112                                            const struct in6_addr *prefix, int prefixlen,
113                                            const struct in6_addr *gwaddr,
114                                            struct net_device *dev,
115                                            unsigned int pref);
116 static struct fib6_info *rt6_get_route_info(struct net *net,
117                                            const struct in6_addr *prefix, int prefixlen,
118                                            const struct in6_addr *gwaddr,
119                                            struct net_device *dev);
120 #endif
121
/* Per-CPU bookkeeping for rt6_info entries that live outside the FIB
 * tree, so they can be re-homed when a device goes away (see
 * rt6_uncached_list_flush_dev()).
 */
struct uncached_list {
	spinlock_t		lock;	/* protects head */
	struct list_head	head;	/* rt6_info.rt6i_uncached entries */
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
128
129 void rt6_uncached_list_add(struct rt6_info *rt)
130 {
131         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
132
133         rt->rt6i_uncached_list = ul;
134
135         spin_lock_bh(&ul->lock);
136         list_add_tail(&rt->rt6i_uncached, &ul->head);
137         spin_unlock_bh(&ul->lock);
138 }
139
140 void rt6_uncached_list_del(struct rt6_info *rt)
141 {
142         if (!list_empty(&rt->rt6i_uncached)) {
143                 struct uncached_list *ul = rt->rt6i_uncached_list;
144                 struct net *net = dev_net(rt->dst.dev);
145
146                 spin_lock_bh(&ul->lock);
147                 list_del(&rt->rt6i_uncached);
148                 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
149                 spin_unlock_bh(&ul->lock);
150         }
151 }
152
/* Re-home any uncached routes referencing @dev onto the namespace
 * loopback device so @dev can be unregistered safely.
 */
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	/* loopback is the fallback target; nothing to move off it */
	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			/* swap the inet6_dev reference over to loopback */
			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			/* likewise for the dst's device reference */
			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
184
185 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
186                                              struct sk_buff *skb,
187                                              const void *daddr)
188 {
189         if (!ipv6_addr_any(p))
190                 return (const void *) p;
191         else if (skb)
192                 return &ipv6_hdr(skb)->daddr;
193         return daddr;
194 }
195
196 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
197                                    struct net_device *dev,
198                                    struct sk_buff *skb,
199                                    const void *daddr)
200 {
201         struct neighbour *n;
202
203         daddr = choose_neigh_daddr(gw, skb, daddr);
204         n = __ipv6_neigh_lookup(dev, daddr);
205         if (n)
206                 return n;
207         return neigh_create(&nd_tbl, daddr, dev);
208 }
209
210 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
211                                               struct sk_buff *skb,
212                                               const void *daddr)
213 {
214         const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
215
216         return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
217 }
218
/* dst_ops->confirm_neigh: confirm reachability of the neighbour used by
 * this dst.  Resolves the effective destination the same way
 * ip6_dst_neigh_lookup() does, then calls __ipv6_confirm_neigh().
 */
static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	/* no neighbour state is kept on NOARP/loopback devices */
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	/* multicast destinations have no unicast neighbour entry */
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}
233
/* dst_ops template for regular IPv6 route dsts, wiring the generic dst
 * layer to the IPv6 handlers defined in this file.
 */
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	dst_cow_metrics_generic,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};
252
253 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
254 {
255         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
256
257         return mtu ? : dst->dev->mtu;
258 }
259
/* Blackhole dsts ignore PMTU updates: intentionally a no-op. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}
264
/* Blackhole dsts ignore redirects: intentionally a no-op. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}
269
/* dst_ops for blackhole dsts: metrics and MTU behave normally, but PMTU
 * updates and redirects are swallowed by the no-op handlers above.
 */
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
};
281
/* Metrics template; only the hop-limit slot is named and it is left at
 * 0 (unset).  NOTE(review): presumably used for the reserved entries
 * below — confirm at the usage sites.
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};
285
/* Template for net->ipv6.fib6_null_entry: the "no route" fib6_info
 * (RTN_UNREACHABLE, worst possible metric, static refcount of 1).
 */
static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol	= RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};
294
/* "null" route template: discards packets and reports -ENETUNREACH. */
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
306
307 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
308
/* "prohibit" route template (policy routing): rejects with -EACCES. */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
320
/* "blackhole" route template: silently discards via dst_discard. */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
332
333 #endif
334
/* Reset the IPv6-specific part of a freshly allocated rt6_info: zero
 * everything that follows the embedded dst_entry (which dst_alloc()
 * already initialised) and re-init the uncached-list linkage.
 */
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	/* zero all fields located after 'dst' within struct rt6_info */
	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}
342
343 /* allocate dst with ip6_dst_ops */
344 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
345                                int flags)
346 {
347         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
348                                         1, DST_OBSOLETE_FORCE_CHK, flags);
349
350         if (rt) {
351                 rt6_info_init(rt);
352                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
353         }
354
355         return rt;
356 }
357 EXPORT_SYMBOL(ip6_dst_alloc);
358
/* dst_ops->destroy: final teardown of an rt6_info.  Drops the metrics,
 * unlinks from the uncached list, and releases the inet6_dev and
 * fib6_info ("from") references held by this entry.
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from = rt->from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	/* clear the backpointer before dropping the reference */
	rt->from = NULL;
	fib6_info_release(from);
}
377
/* dst_ops->ifdown: @dev is going away; re-point this route's inet6_dev
 * reference at the namespace loopback device so @dev can be released.
 */
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}
394
395 static bool __rt6_check_expired(const struct rt6_info *rt)
396 {
397         if (rt->rt6i_flags & RTF_EXPIRES)
398                 return time_after(jiffies, rt->dst.expires);
399         else
400                 return false;
401 }
402
/* Like __rt6_check_expired() but, for entries without their own expiry,
 * also treats an obsolete dst or an expired parent fib6_info ("from")
 * as expiration.
 */
static bool rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (rt->from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(rt->from);
	}
	return false;
}
414
/* Select the multipath sibling of @match whose hash-bucket upper bound
 * covers fl6->mp_hash; falls back to @match itself when no usable
 * sibling covers the hash.
 */
static struct fib6_info *rt6_multipath_select(const struct net *net,
					      struct fib6_info *match,
					      struct flowi6 *fl6, int oif,
					      const struct sk_buff *skb,
					      int strict)
{
	struct fib6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		/* first sibling whose bound covers the hash; only take it
		 * if it scores as usable, otherwise keep @match
		 */
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}
447
448 /*
449  *      Route lookup. rcu_read_lock() should be held.
450  */
451
/* Narrow @rt to the sibling whose nexthop device matches @oif (or whose
 * device owns @saddr when no oif is given), skipping dead nexthops.
 * Returns fib6_null_entry when RT6_LOOKUP_F_IFACE demands an interface
 * match and none is found.  rcu_read_lock() must be held.
 */
static inline struct fib6_info *rt6_device_match(struct net *net,
						 struct fib6_info *rt,
						 const struct in6_addr *saddr,
						 int oif,
						 int flags)
{
	struct fib6_info *sprt;

	/* unconstrained lookup: the head route is fine unless dead */
	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) {
		const struct net_device *dev = sprt->fib6_nh.nh_dev;

		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE)
		return net->ipv6.fib6_null_entry;

	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
}
485
486 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Work item for a deferred router-reachability probe; allocated in
 * rt6_probe() and freed by rt6_probe_deferred().
 */
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;		/* gateway address to solicit */
	struct net_device *dev;		/* held reference, put after sending */
};
492
/* Deferred worker: send a Neighbour Solicitation for the probe target
 * (to its solicited-node multicast address), then drop the device
 * reference taken by rt6_probe() and free the work item.
 */
static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}
504
/* Schedule a reachability probe of @rt's gateway if its neighbour entry
 * is missing or stale; the actual NS is sent from a workqueue.
 */
static void rt6_probe(struct fib6_info *rt)
{
	struct __rt6_probe_work *work;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
		return;

	nh_gw = &rt->fib6_nh.nh_gw;
	dev = rt->fib6_nh.nh_dev;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		struct inet6_dev *idev;

		/* neighbour already reachable: nothing to do */
		if (neigh->nud_state & NUD_VALID)
			goto out;

		idev = __in6_dev_get(dev);
		/* NOTE(review): idev is dereferenced below without a NULL
		 * check; assumes the device still has IPv6 state — confirm.
		 */
		work = NULL;
		write_lock(&neigh->lock);
		/* rate-limit: only probe once per rtr_probe_interval */
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		/* no neighbour entry yet: probe unconditionally */
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);	/* released by rt6_probe_deferred() */
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
559 #else
/* Router-reachability probing compiled out without CONFIG_IPV6_ROUTER_PREF */
static inline void rt6_probe(struct fib6_info *rt)
{
}
563 #endif
564
565 /*
566  * Default Router Selection (RFC 2461 6.3.6)
567  */
568 static inline int rt6_check_dev(struct fib6_info *rt, int oif)
569 {
570         const struct net_device *dev = rt->fib6_nh.nh_dev;
571
572         if (!oif || dev->ifindex == oif)
573                 return 2;
574         return 0;
575 }
576
/* Classify the reachability of @rt's gateway neighbour (RT6_NUD_*).
 * Routes without a gateway nexthop always succeed.
 */
static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	if (rt->fib6_flags & RTF_NONEXTHOP ||
	    !(rt->fib6_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
					  &rt->fib6_nh.nh_gw);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		/* with router preference, only NUD_FAILED demands a probe */
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		/* no neighbour entry: without router-preference support a
		 * missing entry triggers round-robin to the next candidate
		 */
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
608
/* Score a route for selection: bit 1 encodes the device match, higher
 * bits the decoded router preference.  Returns a negative RT6_NUD_*
 * value when the route must be skipped or round-robined.
 */
static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	/* router preference occupies the bits above the device match */
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}
626
/* Whether routes whose nexthop link is down should be skipped, per the
 * device's ignore_routes_with_linkdown setting.
 * Called with rcu_read_lock held.
 */
static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
{
	const struct net_device *dev = fib6_info_nh_dev(f6i);
	bool rc = false;

	if (dev) {
		const struct inet6_dev *idev = __in6_dev_get(dev);

		/* NOTE(review): idev is dereferenced without a NULL check;
		 * assumes the device cannot lose its inet6_dev under RCU
		 * here — confirm.
		 */
		rc = !!idev->cnf.ignore_routes_with_linkdown;
	}

	return rc;
}
641
/* Score @rt and, if it beats *mpri, make it the new best @match.
 * Sets *do_rr when the chosen route asked for round-robin rotation.
 */
static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
				   int *mpri, struct fib6_info *match,
				   bool *do_rr)
{
	int m;
	bool match_do_rr = false;

	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		goto out;

	/* skip link-down nexthops when the device requests it */
	if (fib6_ignore_linkdown(rt) &&
	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (fib6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}
680
/* Find the best-scoring route with fib6_metric == @metric, scanning
 * from the round-robin head @rr_head and wrapping via @leaf; routes
 * with a different metric ("cont") are only tried if no same-metric
 * route matched.  rcu_read_lock() must be held.
 */
static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				     struct fib6_info *leaf,
				     struct fib6_info *rr_head,
				     u32 metric, int oif, int strict,
				     bool *do_rr)
{
	struct fib6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	/* first pass: from the round-robin head to the end of the run */
	for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	/* second pass: wrap around from the leaf up to rr_head */
	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->rt6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	/* fall back to the remaining (different-metric) routes */
	for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
719
/* Pick the best route at node @fn, honouring the round-robin pointer
 * fn->rr_ptr and advancing it when the scorer requested rotation.
 * rcu_read_lock() must be held.
 */
static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
				   int oif, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not points to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->rt6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.fib6_null_entry;
}
769
770 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
771 {
772         return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
773 }
774
775 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Process a received ND Route Information option (RFC 4191): validate
 * it, then add, refresh or withdraw the corresponding RTF_ROUTEINFO
 * route for gateway @gwaddr on @dev.  Returns 0 or -EINVAL.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	/* a full-length option carries the whole prefix; otherwise build
	 * it from the truncated option data
	 */
	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	/* prefix_len == 0 means the default route via this router */
	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	/* a zero lifetime withdraws an existing route */
	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		/* drop the reference taken by the get/add helpers */
		fib6_info_release(rt);
	}
	return 0;
}
849 #endif
850
851 /*
852  *      Misc support functions
853  */
854
/* Resolve the dst device for a route copy; called with rcu_read_lock held */
static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
{
	struct net_device *dev = rt->fib6_nh.nh_dev;

	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}
877
/* Per route-type dst.error value: 0 for types that deliver or forward
 * normally, a negative errno for reject-style routes (reported to the
 * sender via ip6_rt_init_dst_reject()).
 */
static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};
892
893 static int ip6_rt_type_to_error(u8 fib6_type)
894 {
895         return fib6_prop[fib6_type];
896 }
897
898 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
899 {
900         unsigned short flags = 0;
901
902         if (rt->dst_nocount)
903                 flags |= DST_NOCOUNT;
904         if (rt->dst_nopolicy)
905                 flags |= DST_NOPOLICY;
906         if (rt->dst_host)
907                 flags |= DST_HOST;
908
909         return flags;
910 }
911
912 static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
913 {
914         rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);
915
916         switch (ort->fib6_type) {
917         case RTN_BLACKHOLE:
918                 rt->dst.output = dst_discard_out;
919                 rt->dst.input = dst_discard;
920                 break;
921         case RTN_PROHIBIT:
922                 rt->dst.output = ip6_pkt_prohibit_out;
923                 rt->dst.input = ip6_pkt_prohibit;
924                 break;
925         case RTN_THROW:
926         case RTN_UNREACHABLE:
927         default:
928                 rt->dst.output = ip6_pkt_discard_out;
929                 rt->dst.input = ip6_pkt_discard;
930                 break;
931         }
932 }
933
934 static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
935 {
936         rt->dst.flags |= fib6_info_dst_flags(ort);
937
938         if (ort->fib6_flags & RTF_REJECT) {
939                 ip6_rt_init_dst_reject(rt, ort);
940                 return;
941         }
942
943         rt->dst.error = 0;
944         rt->dst.output = ip6_output;
945
946         if (ort->fib6_type == RTN_LOCAL) {
947                 rt->dst.input = ip6_input;
948         } else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
949                 rt->dst.input = ip6_mc_input;
950         } else {
951                 rt->dst.input = ip6_forward;
952         }
953
954         if (ort->fib6_nh.nh_lwtstate) {
955                 rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
956                 lwtunnel_set_redirect(&rt->dst);
957         }
958
959         rt->dst.lastuse = jiffies;
960 }
961
/* Bind the cached copy @rt to its originating fib6 entry @from:
 * take a reference on @from and share its metrics block.
 */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	fib6_info_hold(from);
	rt->from = from;
	dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
	/* shared metrics are refcounted unless they are the global
	 * default block, which is never freed
	 */
	if (from->fib6_metrics != &dst_default_metrics) {
		rt->dst._metrics |= DST_METRICS_REFCOUNTED;
		refcount_inc(&from->fib6_metrics->refcnt);
	}
}
973
/* Initialize rt6_info @rt as a copy of fib6 entry @ort: dst handlers,
 * addresses, flags, and a reference on @ort via rt6_set_from().
 */
static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
{
	struct net_device *dev = fib6_info_nh_dev(ort);

	ip6_rt_init_dst(rt, ort);

	rt->rt6i_dst = ort->fib6_dst;
	/* idev reference is dropped when the dst is destroyed */
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
	rt->rt6i_flags = ort->fib6_flags;
	rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->fib6_src;
#endif
	rt->rt6i_prefsrc = ort->fib6_prefsrc;
	rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
}
991
/* Walk back up the fib trie from @fn, returning the next node that
 * carries route info (RTN_RTINFO), or NULL once the tree root is
 * reached.  When a parent has a source-address subtree that @fn did
 * not come from, the subtree is searched with @saddr first.
 * Caller must hold rcu_read_lock (fn->parent is rcu-dereferenced).
 */
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}
1009
1010 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
1011                           bool null_fallback)
1012 {
1013         struct rt6_info *rt = *prt;
1014
1015         if (dst_hold_safe(&rt->dst))
1016                 return true;
1017         if (null_fallback) {
1018                 rt = net->ipv6.ip6_null_entry;
1019                 dst_hold(&rt->dst);
1020         } else {
1021                 rt = NULL;
1022         }
1023         *prt = rt;
1024         return false;
1025 }
1026
1027 /* called with rcu_lock held */
1028 static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
1029 {
1030         unsigned short flags = fib6_info_dst_flags(rt);
1031         struct net_device *dev = rt->fib6_nh.nh_dev;
1032         struct rt6_info *nrt;
1033
1034         nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1035         if (nrt)
1036                 ip6_rt_copy_init(nrt, rt);
1037
1038         return nrt;
1039 }
1040
/* Per-table route lookup: walk the fib trie for the flow, resolve
 * device and multipath selection, then return a held rt6_info --
 * preferring a cached exception route when one exists.  Falls back
 * to the held ip6_null_entry when nothing matches or allocation
 * fails.
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_info *f6i;
	struct fib6_node *fn;
	struct rt6_info *rt;

	/* caller asked to ignore the nexthop's output interface */
	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	f6i = rcu_dereference(fn->leaf);
	if (!f6i) {
		f6i = net->ipv6.fib6_null_entry;
	} else {
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
				      fl6->flowi6_oif, flags);
		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
			f6i = rt6_multipath_select(net, f6i, fl6,
						   fl6->flowi6_oif, skb, flags);
	}
	/* no usable route at this node: back up the trie and retry */
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);
	} else if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		/* no cached entry: make a fresh rt6_info copy */
		rt = ip6_create_rt_rcu(f6i);
		if (!rt) {
			rt = net->ipv6.ip6_null_entry;
			dst_hold(&rt->dst);
		}
	}

	rcu_read_unlock();

	trace_fib6_table_lookup(net, rt, table, fl6);

	return rt;
}
1095
/* Public flow-based route lookup: dispatch through fib6_rule_lookup()
 * using ip6_pol_route_lookup as the per-table lookup function.
 */
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
1102
1103 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1104                             const struct in6_addr *saddr, int oif,
1105                             const struct sk_buff *skb, int strict)
1106 {
1107         struct flowi6 fl6 = {
1108                 .flowi6_oif = oif,
1109                 .daddr = *daddr,
1110         };
1111         struct dst_entry *dst;
1112         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1113
1114         if (saddr) {
1115                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1116                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1117         }
1118
1119         dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1120         if (dst->error == 0)
1121                 return (struct rt6_info *) dst;
1122
1123         dst_release(dst);
1124
1125         return NULL;
1126 }
1127 EXPORT_SYMBOL(rt6_lookup);
1128
1129 /* ip6_ins_rt is called with FREE table->tb6_lock.
1130  * It takes new route entry, the addition fails by any reason the
1131  * route is released.
1132  * Caller must hold dst before calling it.
1133  */
1134
1135 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1136                         struct netlink_ext_ack *extack)
1137 {
1138         int err;
1139         struct fib6_table *table;
1140
1141         table = rt->fib6_table;
1142         spin_lock_bh(&table->tb6_lock);
1143         err = fib6_add(&table->tb6_root, rt, info, extack);
1144         spin_unlock_bh(&table->tb6_lock);
1145
1146         return err;
1147 }
1148
1149 int ip6_ins_rt(struct net *net, struct fib6_info *rt)
1150 {
1151         struct nl_info info = { .nl_net = net, };
1152
1153         return __ip6_ins_rt(rt, &info, NULL);
1154 }
1155
/* Allocate a host-route (RTF_CACHE) clone of @ort keyed to the exact
 * @daddr (and @saddr under CONFIG_IPV6_SUBTREES).  Used to record
 * per-destination state such as PMTU exceptions.  Returns NULL on
 * allocation failure.
 */
static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(ort);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	rcu_read_unlock();
	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	/* the clone matches exactly one destination */
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
1194
1195 static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
1196 {
1197         unsigned short flags = fib6_info_dst_flags(rt);
1198         struct net_device *dev;
1199         struct rt6_info *pcpu_rt;
1200
1201         rcu_read_lock();
1202         dev = ip6_rt_get_dev_rcu(rt);
1203         pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
1204         rcu_read_unlock();
1205         if (!pcpu_rt)
1206                 return NULL;
1207         ip6_rt_copy_init(pcpu_rt, rt);
1208         pcpu_rt->rt6i_flags |= RTF_PCPU;
1209         return pcpu_rt;
1210 }
1211
1212 /* It should be called with rcu_read_lock() acquired */
1213 static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
1214 {
1215         struct rt6_info *pcpu_rt, **p;
1216
1217         p = this_cpu_ptr(rt->rt6i_pcpu);
1218         pcpu_rt = *p;
1219
1220         if (pcpu_rt)
1221                 ip6_hold_safe(NULL, &pcpu_rt, false);
1222
1223         return pcpu_rt;
1224 }
1225
/* Allocate and install this CPU's pcpu copy of @rt, returning it with
 * a reference held.  Falls back to a held ip6_null_entry when the
 * allocation fails.  The per-cpu slot is expected to be empty when
 * this is called (BUG_ON otherwise).
 */
static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	return pcpu_rt;
}
1244
/* exception hash table implementation
 *
 * rt6_exception_lock serializes all writers of every fib6_info's
 * rt6i_exception_bucket; lookups on the read side use RCU.
 */
static DEFINE_SPINLOCK(rt6_exception_lock);
1248
/* Remove rt6_ex from hash table and free the memory
 * (rcu-deferred) once readers are done; the exception's dst reference
 * is dropped here.  Caller must hold rt6_exception_lock.
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
	net->ipv6.rt6_stats->fib_rt_cache--;
}
1268
1269 /* Remove oldest rt6_ex in bucket and free the memory
1270  * Caller must hold rt6_exception_lock
1271  */
1272 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1273 {
1274         struct rt6_exception *rt6_ex, *oldest = NULL;
1275
1276         if (!bucket)
1277                 return;
1278
1279         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1280                 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1281                         oldest = rt6_ex;
1282         }
1283         rt6_remove_exception(bucket, oldest);
1284 }
1285
/* Hash (dst[, src]) into an exception bucket index.  @src is folded
 * in only under CONFIG_IPV6_SUBTREES, and only when non-NULL.  The
 * per-boot random seed keeps the bucket distribution unpredictable.
 */
static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
1301
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair -- i.e. *bucket is advanced by the hash value,
 * so the caller gets the exact bucket back even on a miss.
 * Returns the matching exception or NULL.
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1334
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair.  RCU counterpart of
 * __rt6_find_exception_spinlock(): identical matching logic, but
 * traverses the chain with hlist_for_each_entry_rcu.
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1369
1370 static unsigned int fib6_mtu(const struct fib6_info *rt)
1371 {
1372         unsigned int mtu;
1373
1374         if (rt->fib6_pmtu) {
1375                 mtu = rt->fib6_pmtu;
1376         } else {
1377                 struct net_device *dev = fib6_info_nh_dev(rt);
1378                 struct inet6_dev *idev;
1379
1380                 rcu_read_lock();
1381                 idev = __in6_dev_get(dev);
1382                 mtu = idev->cnf.mtu6;
1383                 rcu_read_unlock();
1384         }
1385
1386         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1387
1388         return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
1389 }
1390
/* Insert cached route @nrt into the exception table of its parent
 * fib6 entry @ort, allocating the bucket array on first use and
 * replacing any existing exception for the same (daddr[, saddr]).
 * Returns 0 on success or a negative errno.  On success the node's
 * sernum is bumped so stale cached dsts fail their next dst_check().
 */
static int rt6_insert_exception(struct rt6_info *nrt,
				struct fib6_info *ort)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	/* a flushed entry must not regrow a bucket list */
	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->fib6_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
		err = -EINVAL;
		goto out;
	}

	/* an existing exception for the same key is replaced */
	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	/* keep the bucket bounded: drop the oldest entry past the cap */
	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->fib6_table->tb6_lock);
		fib6_update_sernum(net, ort);
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}
1475
/* Remove every cached exception route of @rt and mark the entry so
 * rt6_insert_exception() can no longer recreate the bucket list.
 * Called when @rt is being deleted.
 */
void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() to recreate the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}
1502
/* Find cached rt in the hash table inside passed in rt
 * for (daddr[, saddr]).  Expired entries are ignored.  Returns the
 * cached route without taking a reference, or NULL.
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->fib6_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}
1534
/* Remove the passed in cached rt from the hash table that contains it.
 * Returns 0 on success, -EINVAL when @rt is not a cached route (or has
 * no parent), -ENOENT when it is not found in the table.
 */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct fib6_info *from = rt->from;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err;

	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}
1577
/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp (used by the GC aging logic in
 * rt6_age_examine_exception()).  No-op when @rt is not a cached route
 * or is not present in its parent's exception table.
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct fib6_info *from = rt->from;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;

	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return;

	rcu_read_lock();
	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

	rcu_read_unlock();
}
1613
1614 static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt)
1615 {
1616         struct rt6_exception_bucket *bucket;
1617         struct rt6_exception *rt6_ex;
1618         int i;
1619
1620         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1621                                         lockdep_is_held(&rt6_exception_lock));
1622
1623         if (bucket) {
1624                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1625                         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1626                                 rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
1627                         }
1628                         bucket++;
1629                 }
1630         }
1631 }
1632
1633 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1634                                          struct rt6_info *rt, int mtu)
1635 {
1636         /* If the new MTU is lower than the route PMTU, this new MTU will be the
1637          * lowest MTU in the path: always allow updating the route PMTU to
1638          * reflect PMTU decreases.
1639          *
1640          * If the new MTU is higher, and the route PMTU is equal to the local
1641          * MTU, this means the old MTU is the lowest in the path, so allow
1642          * updating it: if other nodes now have lower MTUs, PMTU discovery will
1643          * handle this.
1644          */
1645
1646         if (dst_mtu(&rt->dst) >= mtu)
1647                 return true;
1648
1649         if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1650                 return true;
1651
1652         return false;
1653 }
1654
/* Propagate a device MTU change to @rt's cached exception routes,
 * updating each entry's RTAX_MTU metric when
 * rt6_mtu_change_route_allowed() permits it.
 * Caller must hold rt6_exception_lock.
 */
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}
1683
#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

/* Remove every cached exception route of @rt that is a gateway route
 * through @gateway (both RTF_GATEWAY and RTF_CACHE set and matching
 * gateway address).
 */
static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}
1720
/* GC helper: examine one exception entry and remove it when aged out,
 * expired, or routed via a gateway that is no longer a router;
 * otherwise count it in gc_args->more so the GC keeps running.
 * Caller must hold rt6_exception_lock (and rcu_read_lock_bh for the
 * neighbour lookup).
 */
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others have still references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		/* drop gateway exceptions whose nexthop lost router status */
		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}
1764
/* Walk every cached exception route hanging off @rt and let
 * rt6_age_examine_exception() prune stale entries as part of fib6
 * garbage collection.
 *
 * rcu_read_lock_bh() is taken before rt6_exception_lock so that the
 * neighbour lookup performed while examining entries is covered; the
 * spinlock itself protects the bucket chains against concurrent
 * insert/remove.
 */
void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* Cheap lockless check: nothing to do for routes with no exceptions. */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}
1795
/* Core policy-routing lookup shared by the input and output paths.
 *
 * Under RCU, selects a fib6 entry from @table (multipath-aware),
 * backtracking up the tree when only the null entry matches and, if
 * RT6_LOOKUP_F_REACHABLE was set, retrying once without it from the
 * saved starting node.  The fib6 result is then materialized as a
 * rt6_info with a reference held for the caller, taken from (in order):
 * the exception (RTF_CACHE) table, a freshly allocated uncached clone
 * (FLOWI_FLAG_KNOWN_NH on a non-gateway route), or the per-cpu route
 * cache.  Never returns NULL: failures yield the held null entry.
 */
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_node *fn, *saved_fn;
	struct fib6_info *f6i;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	/* Hosts (forwarding disabled) prefer reachable routers first. */
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	f6i = rt6_select(net, fn, oif, strict);
	if (f6i->fib6_nsiblings)
		f6i = rt6_multipath_select(net, f6i, fl6, oif, skb, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		trace_fib6_table_lookup(net, rt, table, fl6);
		return rt;
	}

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		/* ip6_hold_safe() substitutes the held null entry on
		 * failure, so rt is always safe to return here.
		 */
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		trace_fib6_table_lookup(net, rt, table, fl6);
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(f6i->fib6_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */

		struct rt6_info *uncached_rt;

		/* Pin f6i across the RCU unlock needed for the GFP
		 * allocation in ip6_rt_cache_alloc().
		 */
		fib6_info_hold(f6i);
		rcu_read_unlock();

		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);
		fib6_info_release(f6i);

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		trace_fib6_table_lookup(net, uncached_rt, table, fl6);
		return uncached_rt;

	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(f6i);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, f6i);

		local_bh_enable();
		rcu_read_unlock();
		trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
1899
1900 static struct rt6_info *ip6_pol_route_input(struct net *net,
1901                                             struct fib6_table *table,
1902                                             struct flowi6 *fl6,
1903                                             const struct sk_buff *skb,
1904                                             int flags)
1905 {
1906         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1907 }
1908
1909 struct dst_entry *ip6_route_input_lookup(struct net *net,
1910                                          struct net_device *dev,
1911                                          struct flowi6 *fl6,
1912                                          const struct sk_buff *skb,
1913                                          int flags)
1914 {
1915         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1916                 flags |= RT6_LOOKUP_F_IFACE;
1917
1918         return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1919 }
1920 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1921
/* Fill @keys with the L3 fields used for multipath hashing of @skb.
 *
 * Normally the outer IPv6 header (or the pre-dissected @flkeys, when the
 * caller supplied them) is used.  For ICMPv6 error messages the embedded
 * offending header is hashed instead, so the error travels the same path
 * as the flow that triggered it; in that case @flkeys is ignored since
 * it describes the outer packet.
 */
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	/* Only error-type ICMPv6 messages embed the original header. */
	icmph = icmp6_hdr(skb);
	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowinfo(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}
1964
/* Compute the multipath hash for a flow, per the net's hash policy.
 *
 * Policy 0 hashes L3 fields (addresses, flow label, protocol); policy 1
 * hashes the L4 4-tuple plus protocol.  If skb is set it will be used
 * and fl6 can be NULL.  The result is shifted right by one so the top
 * bit is free (callers compare against sibling upper bounds).
 */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		/* L3-only hash */
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		/* L4 hash */
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}
2021
/* Route an incoming IPv6 packet and attach the result to skb's dst.
 *
 * Builds a flowi6 from the packet headers (plus tunnel id for collect-md
 * receive), pre-computes the multipath hash for ICMPv6 so errors follow
 * the flow that caused them, and installs the lookup result via
 * skb_dst_set() — including error dsts, which the caller will process.
 */
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	/* Reuse the rule-lookup dissection, if one was done, for hashing. */
	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}
2051
2052 static struct rt6_info *ip6_pol_route_output(struct net *net,
2053                                              struct fib6_table *table,
2054                                              struct flowi6 *fl6,
2055                                              const struct sk_buff *skb,
2056                                              int flags)
2057 {
2058         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2059 }
2060
/* Locally-generated output route lookup.
 *
 * Link-scoped destinations are first offered to an L3 master device
 * (VRF) lookup.  Otherwise the flow's iif is forced to loopback and the
 * lookup flags are derived from the socket binding and source address
 * before running the policy rules.
 */
struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	if (rt6_need_strict(&fl6->daddr)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	/* Pin the oif for bound sockets, strict-scope destinations, or
	 * when an oif is given without a source address to steer by.
	 */
	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2089
/* Replace @dst_orig with a blackhole clone that silently discards all
 * traffic.
 *
 * The clone lives on the loopback device, copies the original's metrics,
 * gateway, flags (minus RTF_PCPU) and destination keys, and uses the
 * discard input/output handlers.  The caller's reference on @dst_orig is
 * always consumed; ERR_PTR(-ENOMEM) is returned if allocation fails.
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new = &rt->dst;
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}
2122
2123 /*
2124  *      Destination cache support functions
2125  */
2126
2127 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2128 {
2129         u32 rt_cookie = 0;
2130
2131         if ((f6i && !fib6_get_cookie_safe(f6i, &rt_cookie)) ||
2132              rt_cookie != cookie)
2133                 return false;
2134
2135         if (fib6_check_expired(f6i))
2136                 return false;
2137
2138         return true;
2139 }
2140
2141 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
2142 {
2143         u32 rt_cookie = 0;
2144
2145         if ((rt->from && !fib6_get_cookie_safe(rt->from, &rt_cookie)) ||
2146             rt_cookie != cookie)
2147                 return NULL;
2148
2149         if (rt6_check_expired(rt))
2150                 return NULL;
2151
2152         return &rt->dst;
2153 }
2154
2155 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
2156 {
2157         if (!__rt6_check_expired(rt) &&
2158             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2159             fib6_check(rt->from, cookie))
2160                 return &rt->dst;
2161         else
2162                 return NULL;
2163 }
2164
2165 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2166 {
2167         struct rt6_info *rt;
2168
2169         rt = (struct rt6_info *) dst;
2170
2171         /* All IPV6 dsts are created with ->obsolete set to the value
2172          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2173          * into this function always.
2174          */
2175
2176         if (rt->rt6i_flags & RTF_PCPU ||
2177             (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from))
2178                 return rt6_dst_from_check(rt, cookie);
2179         else
2180                 return rt6_check(rt, cookie);
2181 }
2182
2183 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2184 {
2185         struct rt6_info *rt = (struct rt6_info *) dst;
2186
2187         if (rt) {
2188                 if (rt->rt6i_flags & RTF_CACHE) {
2189                         if (rt6_check_expired(rt)) {
2190                                 rt6_remove_exception_rt(rt);
2191                                 dst = NULL;
2192                         }
2193                 } else {
2194                         dst_release(dst);
2195                         dst = NULL;
2196                 }
2197         }
2198         return dst;
2199 }
2200
/* dst_ops->link_failure: the neighbour layer gave up on this route.
 *
 * Reports address-unreachable to the sender, then either removes the
 * cached exception the packet was using or, for a default route,
 * invalidates the fib6 node's sernum so cached dsts re-validate.
 */
static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			/* Only remove if we can still take a reference. */
			if (dst_hold_safe(&rt->dst))
				rt6_remove_exception_rt(rt);
		} else if (rt->from) {
			struct fib6_node *fn;

			rcu_read_lock();
			fn = rcu_dereference(rt->from->fib6_node);
			if (fn && (rt->rt6i_flags & RTF_DEFAULT))
				fn->fn_sernum = -1;
			rcu_read_unlock();
		}
	}
}
2223
2224 static void rt6_update_expires(struct rt6_info *rt0, int timeout)
2225 {
2226         if (!(rt0->rt6i_flags & RTF_EXPIRES) && rt0->from)
2227                 rt0->dst.expires = rt0->from->expires;
2228
2229         dst_set_expires(&rt0->dst, timeout);
2230         rt0->rt6i_flags |= RTF_EXPIRES;
2231 }
2232
2233 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2234 {
2235         struct net *net = dev_net(rt->dst.dev);
2236
2237         dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2238         rt->rt6i_flags |= RTF_MODIFIED;
2239         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2240 }
2241
2242 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2243 {
2244         return !(rt->rt6i_flags & RTF_CACHE) &&
2245                 (rt->rt6i_flags & RTF_PCPU || rt->from);
2246 }
2247
/* Core PMTU update: apply @mtu to @dst for the flow identified by
 * either @iph (packet headers) or @sk (connected socket addresses).
 *
 * Local routes and locked-MTU dsts are left alone, and only decreases
 * below the current path MTU (floored at IPV6_MIN_MTU) are applied.
 * Depending on the route type the MTU is written in place or recorded
 * in a newly created cache exception clone.
 */
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	if (rt6->rt6i_flags & RTF_LOCAL)
		return;

	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	/* Pick flow addresses: packet headers win over socket state. */
	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}
	dst_confirm_neigh(dst, daddr);
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
	} else if (daddr) {
		struct rt6_info *nrt6;

		/* Record the PMTU in a fresh exception clone; on a failed
		 * insert the clone is dropped immediately.
		 */
		nrt6 = ip6_rt_cache_alloc(rt6->from, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);
			if (rt6_insert_exception(nrt6, rt6->from))
				dst_release_immediate(&nrt6->dst);
		}
	}
}
2291
2292 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2293                                struct sk_buff *skb, u32 mtu)
2294 {
2295         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2296 }
2297
2298 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2299                      int oif, u32 mark, kuid_t uid)
2300 {
2301         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2302         struct dst_entry *dst;
2303         struct flowi6 fl6;
2304
2305         memset(&fl6, 0, sizeof(fl6));
2306         fl6.flowi6_oif = oif;
2307         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2308         fl6.daddr = iph->daddr;
2309         fl6.saddr = iph->saddr;
2310         fl6.flowlabel = ip6_flowinfo(iph);
2311         fl6.flowi6_uid = uid;
2312
2313         dst = ip6_route_output(net, NULL, &fl6);
2314         if (!dst->error)
2315                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2316         dst_release(dst);
2317 }
2318 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2319
/* Socket-aware PMTU update: apply the Packet Too Big notification using
 * the socket's device binding, mark and uid, then refresh the socket's
 * cached dst if it no longer validates (skipping v4-mapped peers, which
 * the IPv4 path handles).
 */
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	struct dst_entry *dst;

	ip6_update_pmtu(skb, sock_net(sk), mtu,
			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);

	/* Nothing to refresh if there is no cached dst or it still checks
	 * out against the socket's cookie.
	 */
	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2338
/* Cache @dst on @sk, remembering which addresses it was keyed by.
 *
 * The destination (and, with subtrees, the source) is passed to
 * ip6_dst_store() only when it matches the socket's own address, so the
 * cached route is tied to the connected flow rather than an arbitrary
 * lookup.
 */
void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
			   const struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_SUBTREES
	struct ipv6_pinfo *np = inet6_sk(sk);
#endif

	ip6_dst_store(sk, dst,
		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
		      &sk->sk_v6_daddr : NULL,
#ifdef CONFIG_IPV6_SUBTREES
		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
		      &np->saddr :
#endif
		      NULL);
}
2355
/* Handle redirects.
 *
 * fl6 must stay the first member: the struct is passed to the lookup
 * machinery as a plain flowi6 and cast back in __ip6_route_redirect()
 * to recover the announced gateway.
 */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;
};
2361
/* Find the route a redirect applies to, under RCU.
 *
 * Returns a rt6_info with a reference held for the caller: the matching
 * cached exception or, failing that, an rt built from the selected fib6
 * entry (the null entry when nothing matches or the route is a reject).
 */
static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *ret = NULL, *rt_cache;
	struct fib6_info *rt;
	struct fib6_node *fn;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	rcu_read_lock();
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;
		if (fib6_check_expired(rt))
			continue;
		if (rt->fib6_flags & RTF_REJECT)
			break;
		if (!(rt->fib6_flags & RTF_GATEWAY))
			continue;
		if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
			continue;
		/* rt_cache's gateway might be different from its 'parent'
		 * in the case of an ip redirect.
		 * So we keep searching in the exception table if the gateway
		 * is different.
		 */
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
			rt_cache = rt6_find_cached_rt(rt,
						      &fl6->daddr,
						      &fl6->saddr);
			if (rt_cache &&
			    ipv6_addr_equal(&rdfl->gateway,
					    &rt_cache->rt6i_gateway)) {
				ret = rt_cache;
				break;
			}
			continue;
		}
		break;
	}

	if (!rt)
		rt = net->ipv6.fib6_null_entry;
	else if (rt->fib6_flags & RTF_REJECT) {
		ret = net->ipv6.ip6_null_entry;
		goto out;
	}

	/* No candidate on this node: climb the tree and retry. */
	if (rt == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

out:
	if (ret)
		dst_hold(&ret->dst);
	else
		ret = ip6_create_rt_rcu(rt);

	rcu_read_unlock();

	trace_fib6_table_lookup(net, ret, table, fl6);
	return ret;
};
2441
2442 static struct dst_entry *ip6_route_redirect(struct net *net,
2443                                             const struct flowi6 *fl6,
2444                                             const struct sk_buff *skb,
2445                                             const struct in6_addr *gateway)
2446 {
2447         int flags = RT6_LOOKUP_F_HAS_SADDR;
2448         struct ip6rd_flowi rdfl;
2449
2450         rdfl.fl6 = *fl6;
2451         rdfl.gateway = *gateway;
2452
2453         return fib6_rule_lookup(net, &rdfl.fl6, skb,
2454                                 flags, __ip6_route_redirect);
2455 }
2456
2457 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2458                   kuid_t uid)
2459 {
2460         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2461         struct dst_entry *dst;
2462         struct flowi6 fl6;
2463
2464         memset(&fl6, 0, sizeof(fl6));
2465         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2466         fl6.flowi6_oif = oif;
2467         fl6.flowi6_mark = mark;
2468         fl6.daddr = iph->daddr;
2469         fl6.saddr = iph->saddr;
2470         fl6.flowlabel = ip6_flowinfo(iph);
2471         fl6.flowi6_uid = uid;
2472
2473         dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2474         rt6_do_redirect(dst, NULL, skb);
2475         dst_release(dst);
2476 }
2477 EXPORT_SYMBOL_GPL(ip6_redirect);
2478
2479 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2480                             u32 mark)
2481 {
2482         const struct ipv6hdr *iph = ipv6_hdr(skb);
2483         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2484         struct dst_entry *dst;
2485         struct flowi6 fl6;
2486
2487         memset(&fl6, 0, sizeof(fl6));
2488         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2489         fl6.flowi6_oif = oif;
2490         fl6.flowi6_mark = mark;
2491         fl6.daddr = msg->dest;
2492         fl6.saddr = iph->daddr;
2493         fl6.flowi6_uid = sock_net_uid(net, NULL);
2494
2495         dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2496         rt6_do_redirect(dst, NULL, skb);
2497         dst_release(dst);
2498 }
2499
2500 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2501 {
2502         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2503                      sk->sk_uid);
2504 }
2505 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2506
2507 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2508 {
2509         struct net_device *dev = dst->dev;
2510         unsigned int mtu = dst_mtu(dst);
2511         struct net *net = dev_net(dev);
2512
2513         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2514
2515         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2516                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2517
2518         /*
2519          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2520          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2521          * IPV6_MAXPLEN is also valid and means: "any MSS,
2522          * rely only on pmtu discovery"
2523          */
2524         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2525                 mtu = IPV6_MAXPLEN;
2526         return mtu;
2527 }
2528
2529 static unsigned int ip6_mtu(const struct dst_entry *dst)
2530 {
2531         struct inet6_dev *idev;
2532         unsigned int mtu;
2533
2534         mtu = dst_metric_raw(dst, RTAX_MTU);
2535         if (mtu)
2536                 goto out;
2537
2538         mtu = IPV6_MIN_MTU;
2539
2540         rcu_read_lock();
2541         idev = __in6_dev_get(dst->dev);
2542         if (idev)
2543                 mtu = idev->cnf.mtu6;
2544         rcu_read_unlock();
2545
2546 out:
2547         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2548
2549         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2550 }
2551
/* Allocate a one-off host route used as the dst for an outgoing ICMPv6
 * packet.  The route is not inserted into the FIB; it is put on the
 * uncached list instead so device teardown can find it.  The result is
 * passed through xfrm_lookup(), so callers may get a transformed bundle.
 * Returns an ERR_PTR() on failure.
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
                                  struct flowi6 *fl6)
{
        struct dst_entry *dst;
        struct rt6_info *rt;
        struct inet6_dev *idev = in6_dev_get(dev);
        struct net *net = dev_net(dev);

        if (unlikely(!idev))
                return ERR_PTR(-ENODEV);

        rt = ip6_dst_alloc(net, dev, 0);
        if (unlikely(!rt)) {
                /* rt owns the idev reference on success; drop it here. */
                in6_dev_put(idev);
                dst = ERR_PTR(-ENOMEM);
                goto out;
        }

        rt->dst.flags |= DST_HOST;
        rt->dst.input = ip6_input;
        rt->dst.output  = ip6_output;
        rt->rt6i_gateway  = fl6->daddr;
        rt->rt6i_dst.addr = fl6->daddr;
        rt->rt6i_dst.plen = 128;
        rt->rt6i_idev     = idev;
        dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

        /* Add this dst into uncached_list so that rt6_disable_ip() can
         * do proper release of the net_device
         */
        rt6_uncached_list_add(rt);
        atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);

        dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
        return dst;
}
2590
/* dst_ops garbage-collection callback for IPv6 routes.
 *
 * Skips the run entirely if the minimum GC interval has not elapsed and
 * the entry count is within ip6_rt_max_size.  Otherwise runs fib6 GC
 * with a growing expire value (ip6_rt_gc_expire), resetting it to half
 * the GC timeout once the entry count drops below gc_thresh.  The
 * expire value also decays by 1/2^elasticity on every call.  Returns
 * nonzero while entries still exceed ip6_rt_max_size.
 */
static int ip6_dst_gc(struct dst_ops *ops)
{
        struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
        int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
        int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
        int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
        int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
        unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
        int entries;

        entries = dst_entries_get_fast(ops);
        if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
            entries <= rt_max_size)
                goto out;

        net->ipv6.ip6_rt_gc_expire++;
        fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
        /* Recount with the slow (exact) counter after the GC pass. */
        entries = dst_entries_get_slow(ops);
        if (entries < ops->gc_thresh)
                net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
        net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
        return entries > rt_max_size;
}
2615
2616 static int ip6_convert_metrics(struct net *net, struct fib6_info *rt,
2617                                struct fib6_config *cfg)
2618 {
2619         struct dst_metrics *p;
2620
2621         if (!cfg->fc_mx)
2622                 return 0;
2623
2624         p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL);
2625         if (unlikely(!p))
2626                 return -ENOMEM;
2627
2628         refcount_set(&p->refcnt, 1);
2629         rt->fib6_metrics = p;
2630
2631         return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics);
2632 }
2633
2634 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2635                                             struct fib6_config *cfg,
2636                                             const struct in6_addr *gw_addr,
2637                                             u32 tbid, int flags)
2638 {
2639         struct flowi6 fl6 = {
2640                 .flowi6_oif = cfg->fc_ifindex,
2641                 .daddr = *gw_addr,
2642                 .saddr = cfg->fc_prefsrc,
2643         };
2644         struct fib6_table *table;
2645         struct rt6_info *rt;
2646
2647         table = fib6_get_table(net, tbid);
2648         if (!table)
2649                 return NULL;
2650
2651         if (!ipv6_addr_any(&cfg->fc_prefsrc))
2652                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2653
2654         flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2655         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2656
2657         /* if table lookup failed, fall back to full lookup */
2658         if (rt == net->ipv6.ip6_null_entry) {
2659                 ip6_rt_put(rt);
2660                 rt = NULL;
2661         }
2662
2663         return rt;
2664 }
2665
2666 static int ip6_route_check_nh_onlink(struct net *net,
2667                                      struct fib6_config *cfg,
2668                                      const struct net_device *dev,
2669                                      struct netlink_ext_ack *extack)
2670 {
2671         u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2672         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2673         u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2674         struct rt6_info *grt;
2675         int err;
2676
2677         err = 0;
2678         grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2679         if (grt) {
2680                 if (!grt->dst.error &&
2681                     (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2682                         NL_SET_ERR_MSG(extack,
2683                                        "Nexthop has invalid gateway or device mismatch");
2684                         err = -EINVAL;
2685                 }
2686
2687                 ip6_rt_put(grt);
2688         }
2689
2690         return err;
2691 }
2692
/* Resolve and validate the gateway of a non-onlink nexthop.
 *
 * Looks up cfg->fc_gateway, first in cfg->fc_table (if given) and then
 * via rt6_lookup().  When the caller supplied a device it must match
 * the resolved route's device; when it did not, *_dev and *idev are
 * filled in from the result with references held for the caller.
 * Succeeds (returns 0) only if the resolved route is not itself via a
 * gateway, i.e. the nexthop is directly reachable.
 */
static int ip6_route_check_nh(struct net *net,
                              struct fib6_config *cfg,
                              struct net_device **_dev,
                              struct inet6_dev **idev)
{
        const struct in6_addr *gw_addr = &cfg->fc_gateway;
        struct net_device *dev = _dev ? *_dev : NULL;
        struct rt6_info *grt = NULL;
        int err = -EHOSTUNREACH;

        if (cfg->fc_table) {
                int flags = RT6_LOOKUP_F_IFACE;

                grt = ip6_nh_lookup_table(net, cfg, gw_addr,
                                          cfg->fc_table, flags);
                if (grt) {
                        /* A gatewayed or device-mismatched result from the
                         * specific table is unusable; drop it and retry
                         * with a full lookup below.
                         */
                        if (grt->rt6i_flags & RTF_GATEWAY ||
                            (dev && dev != grt->dst.dev)) {
                                ip6_rt_put(grt);
                                grt = NULL;
                        }
                }
        }

        if (!grt)
                grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);

        if (!grt)
                goto out;

        if (dev) {
                if (dev != grt->dst.dev) {
                        ip6_rt_put(grt);
                        goto out;
                }
        } else {
                /* Hand the resolved device/idev back to the caller,
                 * taking references on its behalf.
                 */
                *_dev = dev = grt->dst.dev;
                *idev = grt->rt6i_idev;
                dev_hold(dev);
                in6_dev_hold(grt->rt6i_idev);
        }

        if (!(grt->rt6i_flags & RTF_GATEWAY))
                err = 0;

        ip6_rt_put(grt);

out:
        return err;
}
2743
/* Validate cfg->fc_gateway for a new route and resolve the egress device.
 *
 * Rejects gateways that are local addresses, gateways that are neither
 * link-local unicast nor unicast/IPv4-mapped, and loopback egress
 * devices.  For non-link-local gateways the nexthop is checked via
 * ip6_route_check_nh{,_onlink}(), which may fill in *_dev and *idev.
 * Returns 0 on success, negative errno (with extack set) otherwise.
 */
static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
                           struct net_device **_dev, struct inet6_dev **idev,
                           struct netlink_ext_ack *extack)
{
        const struct in6_addr *gw_addr = &cfg->fc_gateway;
        int gwa_type = ipv6_addr_type(gw_addr);
        bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
        const struct net_device *dev = *_dev;
        bool need_addr_check = !dev;
        int err = -EINVAL;

        /* if gw_addr is local we will fail to detect this in case
         * address is still TENTATIVE (DAD in progress). rt6_lookup()
         * will return already-added prefix route via interface that
         * prefix route was assigned to, which might be non-loopback.
         */
        if (dev &&
            ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
                NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
                goto out;
        }

        if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
                /* IPv6 strictly inhibits using not link-local
                 * addresses as nexthop address.
                 * Otherwise, router will not able to send redirects.
                 * It is very good, but in some (rare!) circumstances
                 * (SIT, PtP, NBMA NOARP links) it is handy to allow
                 * some exceptions. --ANK
                 * We allow IPv4-mapped nexthops to support RFC4798-type
                 * addressing
                 */
                if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
                        NL_SET_ERR_MSG(extack, "Invalid gateway address");
                        goto out;
                }

                if (cfg->fc_flags & RTNH_F_ONLINK)
                        err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
                else
                        err = ip6_route_check_nh(net, cfg, _dev, idev);

                if (err)
                        goto out;
        }

        /* reload in case device was changed */
        dev = *_dev;

        err = -EINVAL;
        if (!dev) {
                NL_SET_ERR_MSG(extack, "Egress device not specified");
                goto out;
        } else if (dev->flags & IFF_LOOPBACK) {
                NL_SET_ERR_MSG(extack,
                               "Egress device can not be loopback device for this route");
                goto out;
        }

        /* if we did not check gw_addr above, do so now that the
         * egress device has been resolved.
         */
        if (need_addr_check &&
            ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
                NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
                goto out;
        }

        err = 0;
out:
        return err;
}
2816
/* Allocate and initialize a fib6_info from a route configuration.
 *
 * Validates the config (internal-only flags, route type, prefix
 * lengths, nexthop device/gateway), resolves the FIB table and egress
 * device, and returns a fully initialized fib6_info holding references
 * on the device and metrics.  The entry is NOT inserted into the FIB;
 * see ip6_route_add().  Returns an ERR_PTR() on failure; the error path
 * drops any device/idev references taken here.
 */
static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
                                              gfp_t gfp_flags,
                                              struct netlink_ext_ack *extack)
{
        struct net *net = cfg->fc_nlinfo.nl_net;
        struct fib6_info *rt = NULL;
        struct net_device *dev = NULL;
        struct inet6_dev *idev = NULL;
        struct fib6_table *table;
        int addr_type;
        int err = -EINVAL;

        /* RTF_PCPU is an internal flag; can not be set by userspace */
        if (cfg->fc_flags & RTF_PCPU) {
                NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
                goto out;
        }

        /* RTF_CACHE is an internal flag; can not be set by userspace */
        if (cfg->fc_flags & RTF_CACHE) {
                NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
                goto out;
        }

        if (cfg->fc_type > RTN_MAX) {
                NL_SET_ERR_MSG(extack, "Invalid route type");
                goto out;
        }

        if (cfg->fc_dst_len > 128) {
                NL_SET_ERR_MSG(extack, "Invalid prefix length");
                goto out;
        }
        if (cfg->fc_src_len > 128) {
                NL_SET_ERR_MSG(extack, "Invalid source address length");
                goto out;
        }
#ifndef CONFIG_IPV6_SUBTREES
        if (cfg->fc_src_len) {
                NL_SET_ERR_MSG(extack,
                               "Specifying source address requires IPV6_SUBTREES to be enabled");
                goto out;
        }
#endif
        /* Pin the requested device and its IPv6 state; references are
         * released in the error path or handed to the new route.
         */
        if (cfg->fc_ifindex) {
                err = -ENODEV;
                dev = dev_get_by_index(net, cfg->fc_ifindex);
                if (!dev)
                        goto out;
                idev = in6_dev_get(dev);
                if (!idev)
                        goto out;
        }

        if (cfg->fc_metric == 0)
                cfg->fc_metric = IP6_RT_PRIO_USER;

        /* Onlink nexthops require an explicit, up device. */
        if (cfg->fc_flags & RTNH_F_ONLINK) {
                if (!dev) {
                        NL_SET_ERR_MSG(extack,
                                       "Nexthop device required for onlink");
                        err = -ENODEV;
                        goto out;
                }

                if (!(dev->flags & IFF_UP)) {
                        NL_SET_ERR_MSG(extack, "Nexthop device is not up");
                        err = -ENETDOWN;
                        goto out;
                }
        }

        err = -ENOBUFS;
        if (cfg->fc_nlinfo.nlh &&
            !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
                table = fib6_get_table(net, cfg->fc_table);
                if (!table) {
                        pr_warn("NLM_F_CREATE should be specified when creating new route\n");
                        table = fib6_new_table(net, cfg->fc_table);
                }
        } else {
                table = fib6_new_table(net, cfg->fc_table);
        }

        if (!table)
                goto out;

        err = -ENOMEM;
        rt = fib6_info_alloc(gfp_flags);
        if (!rt)
                goto out;

        if (cfg->fc_flags & RTF_ADDRCONF)
                rt->dst_nocount = true;

        err = ip6_convert_metrics(net, rt, cfg);
        if (err < 0)
                goto out;

        if (cfg->fc_flags & RTF_EXPIRES)
                fib6_set_expires(rt, jiffies +
                                clock_t_to_jiffies(cfg->fc_expires));
        else
                fib6_clean_expires(rt);

        if (cfg->fc_protocol == RTPROT_UNSPEC)
                cfg->fc_protocol = RTPROT_BOOT;
        rt->fib6_protocol = cfg->fc_protocol;

        addr_type = ipv6_addr_type(&cfg->fc_dst);

        if (cfg->fc_encap) {
                struct lwtunnel_state *lwtstate;

                err = lwtunnel_build_state(cfg->fc_encap_type,
                                           cfg->fc_encap, AF_INET6, cfg,
                                           &lwtstate, extack);
                if (err)
                        goto out;
                rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
        }

        ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
        rt->fib6_dst.plen = cfg->fc_dst_len;
        if (rt->fib6_dst.plen == 128)
                rt->dst_host = true;

#ifdef CONFIG_IPV6_SUBTREES
        ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
        rt->fib6_src.plen = cfg->fc_src_len;
#endif

        rt->fib6_metric = cfg->fc_metric;
        rt->fib6_nh.nh_weight = 1;

        rt->fib6_type = cfg->fc_type;

        /* We cannot add true routes via loopback here,
           they would result in kernel looping; promote them to reject routes
         */
        if ((cfg->fc_flags & RTF_REJECT) ||
            (dev && (dev->flags & IFF_LOOPBACK) &&
             !(addr_type & IPV6_ADDR_LOOPBACK) &&
             !(cfg->fc_flags & RTF_LOCAL))) {
                /* hold loopback dev/idev if we haven't done so. */
                if (dev != net->loopback_dev) {
                        if (dev) {
                                dev_put(dev);
                                in6_dev_put(idev);
                        }
                        dev = net->loopback_dev;
                        dev_hold(dev);
                        idev = in6_dev_get(dev);
                        if (!idev) {
                                err = -ENODEV;
                                goto out;
                        }
                }
                rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
                goto install_route;
        }

        if (cfg->fc_flags & RTF_GATEWAY) {
                /* May replace dev/idev with the resolved egress device. */
                err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
                if (err)
                        goto out;

                rt->fib6_nh.nh_gw = cfg->fc_gateway;
        }

        err = -ENODEV;
        if (!dev)
                goto out;

        if (idev->cnf.disable_ipv6) {
                NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
                err = -EACCES;
                goto out;
        }

        if (!(dev->flags & IFF_UP)) {
                NL_SET_ERR_MSG(extack, "Nexthop device is not up");
                err = -ENETDOWN;
                goto out;
        }

        if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
                if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
                        NL_SET_ERR_MSG(extack, "Invalid source address");
                        err = -EINVAL;
                        goto out;
                }
                rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
                rt->fib6_prefsrc.plen = 128;
        } else
                rt->fib6_prefsrc.plen = 0;

        rt->fib6_flags = cfg->fc_flags;

install_route:
        if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
            !netif_carrier_ok(dev))
                rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
        rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
        /* The device reference taken above is kept via nh_dev on this
         * success path (the error path below releases it).
         */
        rt->fib6_nh.nh_dev = dev;
        rt->fib6_table = table;

        cfg->fc_nlinfo.nl_net = dev_net(dev);

        if (idev)
                in6_dev_put(idev);

        return rt;
out:
        if (dev)
                dev_put(dev);
        if (idev)
                in6_dev_put(idev);

        fib6_info_release(rt);
        return ERR_PTR(err);
}
3039
3040 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3041                   struct netlink_ext_ack *extack)
3042 {
3043         struct fib6_info *rt;
3044         int err;
3045
3046         rt = ip6_route_info_create(cfg, gfp_flags, extack);
3047         if (IS_ERR(rt))
3048                 return PTR_ERR(rt);
3049
3050         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3051         fib6_info_release(rt);
3052
3053         return err;
3054 }
3055
3056 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3057 {
3058         struct net *net = info->nl_net;
3059         struct fib6_table *table;
3060         int err;
3061
3062         if (rt == net->ipv6.fib6_null_entry) {
3063                 err = -ENOENT;
3064                 goto out;
3065         }
3066
3067         table = rt->fib6_table;
3068         spin_lock_bh(&table->tb6_lock);
3069         err = fib6_del(rt, info);
3070         spin_unlock_bh(&table->tb6_lock);
3071
3072 out:
3073         fib6_info_release(rt);
3074         return err;
3075 }
3076
/* Convenience wrapper around __ip6_del_rt() with a minimal nl_info
 * carrying only the namespace.  Consumes the caller's reference on @rt.
 */
int ip6_del_rt(struct net *net, struct fib6_info *rt)
{
        struct nl_info info = { .nl_net = net };

        return __ip6_del_rt(rt, &info);
}
3083
/* Delete a multipath route together with all of its siblings
 * (fc_delete_all_nh).  When possible a single RTM_DELROUTE notification
 * covering every hop is built up front and fib6_del()'s per-hop
 * notifications are suppressed.  Consumes the caller's reference on
 * @rt.
 */
static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
{
        struct nl_info *info = &cfg->fc_nlinfo;
        struct net *net = info->nl_net;
        struct sk_buff *skb = NULL;
        struct fib6_table *table;
        int err = -ENOENT;

        if (rt == net->ipv6.fib6_null_entry)
                goto out_put;
        table = rt->fib6_table;
        spin_lock_bh(&table->tb6_lock);

        if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
                struct fib6_info *sibling, *next_sibling;

                /* prefer to send a single notification with all hops */
                skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
                if (skb) {
                        u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

                        if (rt6_fill_node(net, skb, rt, NULL,
                                          NULL, NULL, 0, RTM_DELROUTE,
                                          info->portid, seq, 0) < 0) {
                                kfree_skb(skb);
                                skb = NULL;
                        } else
                                info->skip_notify = 1;
                }

                /* Delete the siblings first; rt itself goes last below. */
                list_for_each_entry_safe(sibling, next_sibling,
                                         &rt->fib6_siblings,
                                         fib6_siblings) {
                        err = fib6_del(sibling, info);
                        if (err)
                                goto out_unlock;
                }
        }

        err = fib6_del(rt, info);
out_unlock:
        spin_unlock_bh(&table->tb6_lock);
out_put:
        fib6_info_release(rt);

        /* Send the combined notification outside the table lock. */
        if (skb) {
                rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
                            info->nlh, gfp_any());
        }
        return err;
}
3135
3136 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3137 {
3138         int rc = -ESRCH;
3139
3140         if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3141                 goto out;
3142
3143         if (cfg->fc_flags & RTF_GATEWAY &&
3144             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3145                 goto out;
3146         if (dst_hold_safe(&rt->dst))
3147                 rc = rt6_remove_exception_rt(rt);
3148 out:
3149         return rc;
3150 }
3151
3152 static int ip6_route_del(struct fib6_config *cfg,
3153                          struct netlink_ext_ack *extack)
3154 {
3155         struct rt6_info *rt_cache;
3156         struct fib6_table *table;
3157         struct fib6_info *rt;
3158         struct fib6_node *fn;
3159         int err = -ESRCH;
3160
3161         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3162         if (!table) {
3163                 NL_SET_ERR_MSG(extack, "FIB table does not exist");
3164                 return err;
3165         }
3166
3167         rcu_read_lock();
3168
3169         fn = fib6_locate(&table->tb6_root,
3170                          &cfg->fc_dst, cfg->fc_dst_len,
3171                          &cfg->fc_src, cfg->fc_src_len,
3172                          !(cfg->fc_flags & RTF_CACHE));
3173
3174         if (fn) {
3175                 for_each_fib6_node_rt_rcu(fn) {
3176                         if (cfg->fc_flags & RTF_CACHE) {
3177                                 int rc;
3178
3179                                 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3180                                                               &cfg->fc_src);
3181                                 if (rt_cache) {
3182                                         rc = ip6_del_cached_rt(rt_cache, cfg);
3183                                         if (rc != -ESRCH)
3184                                                 return rc;
3185                                 }
3186                                 continue;
3187                         }
3188                         if (cfg->fc_ifindex &&
3189                             (!rt->fib6_nh.nh_dev ||
3190                              rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
3191                                 continue;
3192                         if (cfg->fc_flags & RTF_GATEWAY &&
3193                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
3194                                 continue;
3195                         if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3196                                 continue;
3197                         if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3198                                 continue;
3199                         fib6_info_hold(rt);
3200                         rcu_read_unlock();
3201
3202                         /* if gateway was specified only delete the one hop */
3203                         if (cfg->fc_flags & RTF_GATEWAY)
3204                                 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3205
3206                         return __ip6_del_rt_siblings(rt, cfg);
3207                 }
3208         }
3209         rcu_read_unlock();
3210
3211         return err;
3212 }
3213
/* Process an ICMPv6 Redirect against @dst (@sk is unused here).
 *
 * Validates the redirect (length, non-multicast destination, link-local
 * unicast target, ND options, accept_redirects/forwarding sysctls),
 * updates the neighbour cache entry for the redirect target, installs a
 * cache (exception) route with the new gateway, and announces the
 * change via a NETEVENT_REDIRECT notifier.
 */
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct netevent_redirect netevent;
        struct rt6_info *rt, *nrt = NULL;
        struct ndisc_options ndopts;
        struct inet6_dev *in6_dev;
        struct neighbour *neigh;
        struct rd_msg *msg;
        int optlen, on_link;
        u8 *lladdr;

        optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
        optlen -= sizeof(*msg);

        if (optlen < 0) {
                net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
                return;
        }

        msg = (struct rd_msg *)icmp6_hdr(skb);

        if (ipv6_addr_is_multicast(&msg->dest)) {
                net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
                return;
        }

        /* target == dest means the destination itself is on-link;
         * otherwise the new first hop must be link-local unicast.
         */
        on_link = 0;
        if (ipv6_addr_equal(&msg->dest, &msg->target)) {
                on_link = 1;
        } else if (ipv6_addr_type(&msg->target) !=
                   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
                net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
                return;
        }

        in6_dev = __in6_dev_get(skb->dev);
        if (!in6_dev)
                return;
        /* Only hosts configured to accept redirects honour them. */
        if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
                return;

        /* RFC2461 8.1:
         *      The IP source address of the Redirect MUST be the same as the current
         *      first-hop router for the specified ICMP Destination Address.
         */

        if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
                net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
                return;
        }

        lladdr = NULL;
        if (ndopts.nd_opts_tgt_lladdr) {
                lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
                                             skb->dev);
                if (!lladdr) {
                        net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
                        return;
                }
        }

        rt = (struct rt6_info *) dst;
        if (rt->rt6i_flags & RTF_REJECT) {
                net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
                return;
        }

        /* Redirect received -> path was valid.
         * Look, redirects are sent only in response to data packets,
         * so that this nexthop apparently is reachable. --ANK
         */
        dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);

        neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
        if (!neigh)
                return;

        /*
         *      We have finally decided to accept it.
         */

        ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
                     NEIGH_UPDATE_F_WEAK_OVERRIDE|
                     NEIGH_UPDATE_F_OVERRIDE|
                     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
                                     NEIGH_UPDATE_F_ISROUTER)),
                     NDISC_REDIRECT, &ndopts);

        /* Clone a cache route for the destination via the new gateway. */
        nrt = ip6_rt_cache_alloc(rt->from, &msg->dest, NULL);
        if (!nrt)
                goto out;

        nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
        if (on_link)
                nrt->rt6i_flags &= ~RTF_GATEWAY;

        nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

        /* No need to remove rt from the exception table if rt is
         * a cached route because rt6_insert_exception() will
         * takes care of it
         */
        if (rt6_insert_exception(nrt, rt->from)) {
                dst_release_immediate(&nrt->dst);
                goto out;
        }

        netevent.old = &rt->dst;
        netevent.new = &nrt->dst;
        netevent.daddr = &msg->dest;
        netevent.neigh = neigh;
        call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

out:
        neigh_release(neigh);
}
3330
3331 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Find an RA route-information entry (RTF_ROUTEINFO|RTF_GATEWAY) for
 * @prefix/@prefixlen via @gwaddr on @dev in the device's info table.
 * Returns the entry with a reference held, or NULL.
 */
static struct fib6_info *rt6_get_route_info(struct net *net,
                                           const struct in6_addr *prefix, int prefixlen,
                                           const struct in6_addr *gwaddr,
                                           struct net_device *dev)
{
        u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
        int ifindex = dev->ifindex;
        struct fib6_node *fn;
        struct fib6_info *rt = NULL;
        struct fib6_table *table;

        table = fib6_get_table(net, tb_id);
        if (!table)
                return NULL;

        rcu_read_lock();
        fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
        if (!fn)
                goto out;

        for_each_fib6_node_rt_rcu(fn) {
                if (rt->fib6_nh.nh_dev->ifindex != ifindex)
                        continue;
                if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
                        continue;
                if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
                        continue;
                /* Matched: take a reference before leaving RCU. */
                fib6_info_hold(rt);
                break;
        }
out:
        rcu_read_unlock();
        return rt;
}
3366
3367 static struct fib6_info *rt6_add_route_info(struct net *net,
3368                                            const struct in6_addr *prefix, int prefixlen,
3369                                            const struct in6_addr *gwaddr,
3370                                            struct net_device *dev,
3371                                            unsigned int pref)
3372 {
3373         struct fib6_config cfg = {
3374                 .fc_metric      = IP6_RT_PRIO_USER,
3375                 .fc_ifindex     = dev->ifindex,
3376                 .fc_dst_len     = prefixlen,
3377                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3378                                   RTF_UP | RTF_PREF(pref),
3379                 .fc_protocol = RTPROT_RA,
3380                 .fc_type = RTN_UNICAST,
3381                 .fc_nlinfo.portid = 0,
3382                 .fc_nlinfo.nlh = NULL,
3383                 .fc_nlinfo.nl_net = net,
3384         };
3385
3386         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3387         cfg.fc_dst = *prefix;
3388         cfg.fc_gateway = *gwaddr;
3389
3390         /* We should treat it as a default route if prefix length is 0. */
3391         if (!prefixlen)
3392                 cfg.fc_flags |= RTF_DEFAULT;
3393
3394         ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3395
3396         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3397 }
3398 #endif
3399
/* Find the RA-learned default router entry for gateway @addr on @dev in
 * the device's l3mdev table (or RT6_TABLE_DFLT).  Returns the entry
 * with a reference held, or NULL.
 */
struct fib6_info *rt6_get_dflt_router(struct net *net,
				     const struct in6_addr *addr,
				     struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
	struct fib6_info *rt;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	/* 'rt' is the implicit cursor of the macro; the code below relies
	 * on it being NULL when the walk ends without a break
	 */
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		if (dev == rt->fib6_nh.nh_dev &&
		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
			break;
	}
	if (rt)
		fib6_info_hold(rt);
	rcu_read_unlock();
	return rt;
}
3424
3425 struct fib6_info *rt6_add_dflt_router(struct net *net,
3426                                      const struct in6_addr *gwaddr,
3427                                      struct net_device *dev,
3428                                      unsigned int pref)
3429 {
3430         struct fib6_config cfg = {
3431                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3432                 .fc_metric      = IP6_RT_PRIO_USER,
3433                 .fc_ifindex     = dev->ifindex,
3434                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3435                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3436                 .fc_protocol = RTPROT_RA,
3437                 .fc_type = RTN_UNICAST,
3438                 .fc_nlinfo.portid = 0,
3439                 .fc_nlinfo.nlh = NULL,
3440                 .fc_nlinfo.nl_net = net,
3441         };
3442
3443         cfg.fc_gateway = *gwaddr;
3444
3445         if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3446                 struct fib6_table *table;
3447
3448                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3449                 if (table)
3450                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3451         }
3452
3453         return rt6_get_dflt_router(net, gwaddr, dev);
3454 }
3455
/* Delete every RA-learned default router entry from @table, except on
 * devices with accept_ra == 2 (always accept RAs).  After each deletion
 * the tree may have changed, so the RCU section is dropped and the walk
 * restarts from the top.
 */
static void __rt6_purge_dflt_routers(struct net *net,
				     struct fib6_table *table)
{
	struct fib6_info *rt;

restart:
	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;

		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!idev || idev->cnf.accept_ra != 2)) {
			/* hold the entry so it survives leaving the RCU
			 * section; presumably ip6_del_rt() consumes the
			 * reference — confirm against its definition
			 */
			fib6_info_hold(rt);
			rcu_read_unlock();
			ip6_del_rt(net, rt);
			goto restart;
		}
	}
	rcu_read_unlock();

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}
3479
3480 void rt6_purge_dflt_routers(struct net *net)
3481 {
3482         struct fib6_table *table;
3483         struct hlist_head *head;
3484         unsigned int h;
3485
3486         rcu_read_lock();
3487
3488         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3489                 head = &net->ipv6.fib_table_hash[h];
3490                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3491                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3492                                 __rt6_purge_dflt_routers(net, table);
3493                 }
3494         }
3495
3496         rcu_read_unlock();
3497 }
3498
3499 static void rtmsg_to_fib6_config(struct net *net,
3500                                  struct in6_rtmsg *rtmsg,
3501                                  struct fib6_config *cfg)
3502 {
3503         memset(cfg, 0, sizeof(*cfg));
3504
3505         cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3506                          : RT6_TABLE_MAIN;
3507         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3508         cfg->fc_metric = rtmsg->rtmsg_metric;
3509         cfg->fc_expires = rtmsg->rtmsg_info;
3510         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3511         cfg->fc_src_len = rtmsg->rtmsg_src_len;
3512         cfg->fc_flags = rtmsg->rtmsg_flags;
3513         cfg->fc_type = rtmsg->rtmsg_type;
3514
3515         cfg->fc_nlinfo.nl_net = net;
3516
3517         cfg->fc_dst = rtmsg->rtmsg_dst;
3518         cfg->fc_src = rtmsg->rtmsg_src;
3519         cfg->fc_gateway = rtmsg->rtmsg_gateway;
3520 }
3521
3522 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3523 {
3524         struct fib6_config cfg;
3525         struct in6_rtmsg rtmsg;
3526         int err;
3527
3528         switch (cmd) {
3529         case SIOCADDRT:         /* Add a route */
3530         case SIOCDELRT:         /* Delete a route */
3531                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3532                         return -EPERM;
3533                 err = copy_from_user(&rtmsg, arg,
3534                                      sizeof(struct in6_rtmsg));
3535                 if (err)
3536                         return -EFAULT;
3537
3538                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3539
3540                 rtnl_lock();
3541                 switch (cmd) {
3542                 case SIOCADDRT:
3543                         err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3544                         break;
3545                 case SIOCDELRT:
3546                         err = ip6_route_del(&cfg, NULL);
3547                         break;
3548                 default:
3549                         err = -EINVAL;
3550                 }
3551                 rtnl_unlock();
3552
3553                 return err;
3554         }
3555
3556         return -EINVAL;
3557 }
3558
3559 /*
3560  *      Drop the packet on the floor
3561  */
3562
3563 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3564 {
3565         int type;
3566         struct dst_entry *dst = skb_dst(skb);
3567         switch (ipstats_mib_noroutes) {
3568         case IPSTATS_MIB_INNOROUTES:
3569                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3570                 if (type == IPV6_ADDR_ANY) {
3571                         IP6_INC_STATS(dev_net(dst->dev),
3572                                       __in6_dev_get_safely(skb->dev),
3573                                       IPSTATS_MIB_INADDRERRORS);
3574                         break;
3575                 }
3576                 /* FALLTHROUGH */
3577         case IPSTATS_MIB_OUTNOROUTES:
3578                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3579                               ipstats_mib_noroutes);
3580                 break;
3581         }
3582         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3583         kfree_skb(skb);
3584         return 0;
3585 }
3586
/* Drop with ICMPV6_NOROUTE, counted against the input no-route stat. */
static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
3591
/* Drop with ICMPV6_NOROUTE, counted against the output no-route stat;
 * skb->dev is pointed at the dst device first.
 */
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}
3597
/* Drop with ICMPV6_ADM_PROHIBITED, counted against the input no-route
 * stat.
 */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}
3602
/* Drop with ICMPV6_ADM_PROHIBITED, counted against the output no-route
 * stat; skb->dev is pointed at the dst device first.
 */
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}
3608
3609 /*
3610  *      Allocate a dst for local (unicast / anycast) address.
3611  */
3612
3613 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3614                                      struct inet6_dev *idev,
3615                                      const struct in6_addr *addr,
3616                                      bool anycast, gfp_t gfp_flags)
3617 {
3618         u32 tb_id;
3619         struct net_device *dev = idev->dev;
3620         struct fib6_info *f6i;
3621
3622         f6i = fib6_info_alloc(gfp_flags);
3623         if (!f6i)
3624                 return ERR_PTR(-ENOMEM);
3625
3626         f6i->dst_nocount = true;
3627         f6i->dst_host = true;
3628         f6i->fib6_protocol = RTPROT_KERNEL;
3629         f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
3630         if (anycast) {
3631                 f6i->fib6_type = RTN_ANYCAST;
3632                 f6i->fib6_flags |= RTF_ANYCAST;
3633         } else {
3634                 f6i->fib6_type = RTN_LOCAL;
3635                 f6i->fib6_flags |= RTF_LOCAL;
3636         }
3637
3638         f6i->fib6_nh.nh_gw = *addr;
3639         dev_hold(dev);
3640         f6i->fib6_nh.nh_dev = dev;
3641         f6i->fib6_dst.addr = *addr;
3642         f6i->fib6_dst.plen = 128;
3643         tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3644         f6i->fib6_table = fib6_get_table(net, tb_id);
3645
3646         return f6i;
3647 }
3648
3649 /* remove deleted ip from prefsrc entries */
/* Argument bundle for the fib6_remove_prefsrc() walk callback. */
struct arg_dev_net_ip {
	struct net_device *dev;	/* restrict match to this device; NULL = any */
	struct net *net;
	struct in6_addr *addr;	/* address being removed */
};
3655
3656 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3657 {
3658         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3659         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3660         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3661
3662         if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3663             rt != net->ipv6.fib6_null_entry &&
3664             ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3665                 spin_lock_bh(&rt6_exception_lock);
3666                 /* remove prefsrc entry */
3667                 rt->fib6_prefsrc.plen = 0;
3668                 /* need to update cache as well */
3669                 rt6_exceptions_remove_prefsrc(rt);
3670                 spin_unlock_bh(&rt6_exception_lock);
3671         }
3672         return 0;
3673 }
3674
3675 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3676 {
3677         struct net *net = dev_net(ifp->idev->dev);
3678         struct arg_dev_net_ip adni = {
3679                 .dev = ifp->idev->dev,
3680                 .net = net,
3681                 .addr = &ifp->addr,
3682         };
3683         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3684 }
3685
3686 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3687
3688 /* Remove routers and update dst entries when gateway turn into host. */
3689 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3690 {
3691         struct in6_addr *gateway = (struct in6_addr *)arg;
3692
3693         if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3694             ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
3695                 return -1;
3696         }
3697
3698         /* Further clean up cached routes in exception table.
3699          * This is needed because cached route may have a different
3700          * gateway than its 'parent' in the case of an ip redirect.
3701          */
3702         rt6_exceptions_clean_tohost(rt, gateway);
3703
3704         return 0;
3705 }
3706
/* Remove RA-learned routers via @gateway and clean matching cached
 * routes across all tables (the gateway turned into a plain host).
 */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}
3711
/* Argument bundle for the fib6_ifup()/fib6_ifdown() walk callbacks. */
struct arg_netdev_event {
	const struct net_device *dev;
	union {
		unsigned int nh_flags;	/* used by rt6_sync_up() */
		unsigned long event;	/* used by rt6_sync_down_dev() */
	};
};
3719
/* Return the first entry in @rt's fib6 node leaf list with the same
 * metric that qualifies for ECMP, or NULL.  The table write lock must
 * be held, as documented by the lockdep_is_held() annotations.
 */
static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
{
	struct fib6_info *iter;
	struct fib6_node *fn;

	fn = rcu_dereference_protected(rt->fib6_node,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
	iter = rcu_dereference_protected(fn->leaf,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
	while (iter) {
		if (iter->fib6_metric == rt->fib6_metric &&
		    rt6_qualify_for_ecmp(iter))
			return iter;
		iter = rcu_dereference_protected(iter->rt6_next,
				lockdep_is_held(&rt->fib6_table->tb6_lock));
	}

	return NULL;
}
3739
3740 static bool rt6_is_dead(const struct fib6_info *rt)
3741 {
3742         if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3743             (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3744              fib6_ignore_linkdown(rt)))
3745                 return true;
3746
3747         return false;
3748 }
3749
3750 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3751 {
3752         struct fib6_info *iter;
3753         int total = 0;
3754
3755         if (!rt6_is_dead(rt))
3756                 total += rt->fib6_nh.nh_weight;
3757
3758         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3759                 if (!rt6_is_dead(iter))
3760                         total += iter->fib6_nh.nh_weight;
3761         }
3762
3763         return total;
3764 }
3765
/* Assign the hash upper bound for one nexthop.  *weight accumulates the
 * running weight across the group; the bound is the cumulative fraction
 * of @total scaled into a 31-bit space.  Dead nexthops get -1 —
 * presumably so the path selector skips them; confirm against
 * fib6_select_path().
 */
static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
{
	int upper_bound = -1;

	if (!rt6_is_dead(rt)) {
		*weight += rt->fib6_nh.nh_weight;
		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
						    total) - 1;
	}
	atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
}
3777
3778 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3779 {
3780         struct fib6_info *iter;
3781         int weight = 0;
3782
3783         rt6_upper_bound_set(rt, &weight, total);
3784
3785         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3786                 rt6_upper_bound_set(iter, &weight, total);
3787 }
3788
/* Recompute the hash upper bounds of @rt's ECMP group after a nexthop
 * changed state.  No-op for non-multipath routes and for groups already
 * marked for flushing.
 */
void rt6_multipath_rebalance(struct fib6_info *rt)
{
	struct fib6_info *first;
	int total;

	/* In case the entire multipath route was marked for flushing,
	 * then there is no need to rebalance upon the removal of every
	 * sibling route.
	 */
	if (!rt->fib6_nsiblings || rt->should_flush)
		return;

	/* During lookup routes are evaluated in order, so we need to
	 * make sure upper bounds are assigned from the first sibling
	 * onwards.
	 */
	first = rt6_multipath_first_sibling(rt);
	if (WARN_ON_ONCE(!first))
		return;

	total = rt6_multipath_total_weight(first);
	rt6_multipath_upper_bound_set(first, total);
}
3812
3813 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3814 {
3815         const struct arg_netdev_event *arg = p_arg;
3816         struct net *net = dev_net(arg->dev);
3817
3818         if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
3819                 rt->fib6_nh.nh_flags &= ~arg->nh_flags;
3820                 fib6_update_sernum_upto_root(net, rt);
3821                 rt6_multipath_rebalance(rt);
3822         }
3823
3824         return 0;
3825 }
3826
/* Clear @nh_flags on the nexthops of all routes using @dev.  When
 * RTNH_F_DEAD is being cleared and the carrier is up, RTNH_F_LINKDOWN
 * is added to the set of flags to clear as well.
 */
void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.nh_flags = nh_flags,
		},
	};

	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
		arg.nh_flags |= RTNH_F_LINKDOWN;

	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
}
3841
3842 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3843                                    const struct net_device *dev)
3844 {
3845         struct fib6_info *iter;
3846
3847         if (rt->fib6_nh.nh_dev == dev)
3848                 return true;
3849         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3850                 if (iter->fib6_nh.nh_dev == dev)
3851                         return true;
3852
3853         return false;
3854 }
3855
3856 static void rt6_multipath_flush(struct fib6_info *rt)
3857 {
3858         struct fib6_info *iter;
3859
3860         rt->should_flush = 1;
3861         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3862                 iter->should_flush = 1;
3863 }
3864
3865 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
3866                                              const struct net_device *down_dev)
3867 {
3868         struct fib6_info *iter;
3869         unsigned int dead = 0;
3870
3871         if (rt->fib6_nh.nh_dev == down_dev ||
3872             rt->fib6_nh.nh_flags & RTNH_F_DEAD)
3873                 dead++;
3874         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3875                 if (iter->fib6_nh.nh_dev == down_dev ||
3876                     iter->fib6_nh.nh_flags & RTNH_F_DEAD)
3877                         dead++;
3878
3879         return dead;
3880 }
3881
3882 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
3883                                        const struct net_device *dev,
3884                                        unsigned int nh_flags)
3885 {
3886         struct fib6_info *iter;
3887
3888         if (rt->fib6_nh.nh_dev == dev)
3889                 rt->fib6_nh.nh_flags |= nh_flags;
3890         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3891                 if (iter->fib6_nh.nh_dev == dev)
3892                         iter->fib6_nh.nh_flags |= nh_flags;
3893 }
3894
/* called with write lock held for table with rt */
/* fib6_clean_all() callback for device down/unregister.  The return
 * value is interpreted by the fib walker: 0 keeps the route, negative
 * values request removal (NOTE(review): -1 and -2 are handled
 * differently by fib6_clean_node() — confirm the exact distinction
 * there).
 */
static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
{
	const struct arg_netdev_event *arg = p_arg;
	const struct net_device *dev = arg->dev;
	struct net *net = dev_net(dev);

	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	switch (arg->event) {
	case NETDEV_UNREGISTER:
		/* remove any route bound to the vanishing device */
		return rt->fib6_nh.nh_dev == dev ? -1 : 0;
	case NETDEV_DOWN:
		if (rt->should_flush)
			return -1;
		if (!rt->fib6_nsiblings)
			/* non-multipath: remove iff it uses this device */
			return rt->fib6_nh.nh_dev == dev ? -1 : 0;
		if (rt6_multipath_uses_dev(rt, dev)) {
			unsigned int count;

			count = rt6_multipath_dead_count(rt, dev);
			if (rt->fib6_nsiblings + 1 == count) {
				/* every nexthop of the group is dead */
				rt6_multipath_flush(rt);
				return -1;
			}
			/* otherwise only mark this device's nexthops dead */
			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
						   RTNH_F_LINKDOWN);
			fib6_update_sernum(net, rt);
			rt6_multipath_rebalance(rt);
		}
		return -2;
	case NETDEV_CHANGE:
		/* flag nexthop linkdown, keep the route; local/anycast
		 * routes are left untouched
		 */
		if (rt->fib6_nh.nh_dev != dev ||
		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
			break;
		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
		rt6_multipath_rebalance(rt);
		break;
	}

	return 0;
}
3938
3939 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
3940 {
3941         struct arg_netdev_event arg = {
3942                 .dev = dev,
3943                 {
3944                         .event = event,
3945                 },
3946         };
3947
3948         fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
3949 }
3950
/* Take @dev out of IPv6 service: sync routes for @event, flush the
 * uncached route list for the device, and tear down its ndisc
 * neighbour entries.
 */
void rt6_disable_ip(struct net_device *dev, unsigned long event)
{
	rt6_sync_down_dev(dev, event);
	rt6_uncached_list_flush_dev(dev_net(dev), dev);
	neigh_ifdown(&nd_tbl, dev);
}
3957
/* Argument bundle for the rt6_mtu_change_route() walk callback. */
struct rt6_mtu_change_arg {
	struct net_device *dev;
	unsigned int mtu;	/* the device's new MTU */
};
3962
/* fib6_clean_all() callback for a device MTU change: refresh RTAX_MTU
 * on routes egressing the changed device (unless the metric is locked)
 * and update cached exception routes accordingly.  Always returns 0 —
 * never removes entries.
 */
static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e. jumbo frame)
	 */
	if (rt->fib6_nh.nh_dev == arg->dev &&
	    !fib6_metric_locked(rt, RTAX_MTU)) {
		u32 mtu = rt->fib6_pmtu;

		/* take the new MTU when growing (mtu >= arg->mtu), or when
		 * shrinking and the route was tracking the old device MTU
		 */
		if (mtu >= arg->mtu ||
		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
			fib6_metric_set(rt, RTAX_MTU, arg->mtu);

		spin_lock_bh(&rt6_exception_lock);
		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}
3997
3998 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
3999 {
4000         struct rt6_mtu_change_arg arg = {
4001                 .dev = dev,
4002                 .mtu = mtu,
4003         };
4004
4005         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4006 }
4007
/* Netlink attribute validation policy for IPv6 route requests; consumed
 * by the nlmsg_parse() call in rtm_to_fib6_config().
 */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
	[RTA_OIF]               = { .type = NLA_U32 },
	[RTA_IIF]               = { .type = NLA_U32 },
	[RTA_PRIORITY]          = { .type = NLA_U32 },
	[RTA_METRICS]           = { .type = NLA_NESTED },
	[RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]              = { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
	[RTA_ENCAP]             = { .type = NLA_NESTED },
	[RTA_EXPIRES]           = { .type = NLA_U32 },
	[RTA_UID]               = { .type = NLA_U32 },
	[RTA_MARK]              = { .type = NLA_U32 },
};
4022
/* Translate an RTM_{NEW,DEL}ROUTE netlink message into a fib6_config.
 * Returns 0 on success or a negative errno (parse failure, short
 * RTA_DST/RTA_SRC attributes, invalid encap type).
 *
 * Note the ordering: fc_table is first seeded from rtm->rtm_table and
 * later overridden by RTA_TABLE when that attribute is present.
 */
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg,
			      struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  NULL);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	memset(cfg, 0, sizeof(*cfg));

	cfg->fc_table = rtm->rtm_table;
	cfg->fc_dst_len = rtm->rtm_dst_len;
	cfg->fc_src_len = rtm->rtm_src_len;
	cfg->fc_flags = RTF_UP;
	cfg->fc_protocol = rtm->rtm_protocol;
	cfg->fc_type = rtm->rtm_type;

	/* all reject-style route types map to RTF_REJECT */
	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);

	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
	cfg->fc_nlinfo.nlh = nlh;
	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}

	if (tb[RTA_DST]) {
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		/* attribute must carry at least the prefix bytes */
		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		/* nla_data() points into the request skb; the pointer is
		 * only valid while that message is alive
		 */
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
						     cfg->fc_mp_len, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_PREF]) {
		/* only LOW/HIGH pass through; anything else becomes MEDIUM */
		pref = nla_get_u8(tb[RTA_PREF]);
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

	if (tb[RTA_ENCAP_TYPE]) {
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_EXPIRES]) {
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
		}
	}

	err = 0;
errout:
	return err;
}
4148
/* One pending nexthop collected while building a multipath route. */
struct rt6_nh {
	struct fib6_info *fib6_info;
	struct fib6_config r_cfg;	/* per-nexthop copy of the config */
	struct list_head next;
};
4154
4155 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4156 {
4157         struct rt6_nh *nh;
4158
4159         list_for_each_entry(nh, rt6_nh_list, next) {
4160                 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4161                         &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4162                         nh->r_cfg.fc_ifindex);
4163         }
4164 }
4165
4166 static int ip6_route_info_append(struct net *net,
4167                                  struct list_head *rt6_nh_list,
4168                                  struct fib6_info *rt,
4169                                  struct fib6_config *r_cfg)
4170 {
4171         struct rt6_nh *nh;
4172         int err = -EEXIST;
4173
4174         list_for_each_entry(nh, rt6_nh_list, next) {
4175                 /* check if fib6_info already exists */
4176                 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4177                         return err;
4178         }
4179
4180         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4181         if (!nh)
4182                 return -ENOMEM;
4183         nh->fib6_info = rt;
4184         err = ip6_convert_metrics(net, rt, r_cfg);
4185         if (err) {
4186                 kfree(nh);
4187                 return err;
4188         }
4189         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4190         list_add_tail(&nh->next, rt6_nh_list);
4191
4192         return 0;
4193 }
4194
/* Send the RTM_NEWROUTE notification for a (possibly multipath) route
 * insert.  @rt may be NULL when nothing was inserted, in which case no
 * notification is sent.
 */
static void ip6_route_mpath_notify(struct fib6_info *rt,
				   struct fib6_info *rt_last,
				   struct nl_info *info,
				   __u16 nlflags)
{
	/* if this is an APPEND route, then rt points to the first route
	 * inserted and rt_last points to last route inserted. Userspace
	 * wants a consistent dump of the route which starts at the first
	 * nexthop. Since sibling routes are always added at the end of
	 * the list, find the first sibling of the last route appended
	 */
	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
		rt = list_first_entry(&rt_last->fib6_siblings,
				      struct fib6_info,
				      fib6_siblings);
	}

	if (rt)
		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
}
4215
/* RTM_NEWROUTE handler for a route carrying an RTA_MULTIPATH attribute.
 *
 * Each rtnexthop entry is parsed into a private copy of @cfg (ifindex,
 * gateway, encap overridden per entry), converted to a fib6_info and
 * collected on a local list; the list is then inserted route by route.
 * On insertion failure every route already added is deleted again so
 * the operation is all-or-nothing, and a single coherent notification
 * is sent either way.  Returns 0 or a negative errno.
 */
static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct fib6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	__u16 nlflags;
	int remaining;
	int attrlen;
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);

	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * fib6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}

		/* on-wire rtnh_hops is one less than the nexthop weight */
		rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;

		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
					    rt, &r_cfg);
		if (err) {
			fib6_info_release(rt);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* for add and replace send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		rt_last = nh->fib6_info;
		err = __ip6_ins_rt(nh->fib6_info, info, extack);
		fib6_info_release(nh->fib6_info);

		/* save reference to first route for notification.
		 * NOTE(review): the pointer is used after the release
		 * above — presumably the FIB tree took its own reference
		 * on successful insert; confirm.
		 */
		if (!rt_notif && !err)
			rt_notif = nh->fib6_info;

		/* nh->fib6_info is used or freed at this point, reset to NULL*/
		nh->fib6_info = NULL;
		if (err) {
			if (replace && nhn)
				ip6_print_replace_route_err(&rt6_nh_list);
			err_nh = nh;
			goto add_errout;
		}

		/* Because each route is added like a single route we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
		 * NOTE(review): nlh is dereferenced unconditionally here,
		 * while the 'replace' computation above tolerated a NULL
		 * nlh — confirm all callers pass one.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		nhn++;
	}

	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	goto cleanup;

add_errout:
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg, extack);
	}

cleanup:
	/* release any fib6_info not consumed by the insert loop and
	 * free the list nodes themselves
	 */
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->fib6_info)
			fib6_info_release(nh->fib6_info);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}
4351
4352 static int ip6_route_multipath_del(struct fib6_config *cfg,
4353                                    struct netlink_ext_ack *extack)
4354 {
4355         struct fib6_config r_cfg;
4356         struct rtnexthop *rtnh;
4357         int remaining;
4358         int attrlen;
4359         int err = 1, last_err = 0;
4360
4361         remaining = cfg->fc_mp_len;
4362         rtnh = (struct rtnexthop *)cfg->fc_mp;
4363
4364         /* Parse a Multipath Entry */
4365         while (rtnh_ok(rtnh, remaining)) {
4366                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4367                 if (rtnh->rtnh_ifindex)
4368                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4369
4370                 attrlen = rtnh_attrlen(rtnh);
4371                 if (attrlen > 0) {
4372                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4373
4374                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4375                         if (nla) {
4376                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4377                                 r_cfg.fc_flags |= RTF_GATEWAY;
4378                         }
4379                 }
4380                 err = ip6_route_del(&r_cfg, extack);
4381                 if (err)
4382                         last_err = err;
4383
4384                 rtnh = rtnh_next(rtnh, &remaining);
4385         }
4386
4387         return last_err;
4388 }
4389
4390 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4391                               struct netlink_ext_ack *extack)
4392 {
4393         struct fib6_config cfg;
4394         int err;
4395
4396         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4397         if (err < 0)
4398                 return err;
4399
4400         if (cfg.fc_mp)
4401                 return ip6_route_multipath_del(&cfg, extack);
4402         else {
4403                 cfg.fc_delete_all_nh = 1;
4404                 return ip6_route_del(&cfg, extack);
4405         }
4406 }
4407
4408 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4409                               struct netlink_ext_ack *extack)
4410 {
4411         struct fib6_config cfg;
4412         int err;
4413
4414         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4415         if (err < 0)
4416                 return err;
4417
4418         if (cfg.fc_mp)
4419                 return ip6_route_multipath_add(&cfg, extack);
4420         else
4421                 return ip6_route_add(&cfg, GFP_KERNEL, extack);
4422 }
4423
/* Worst-case netlink message size for dumping @rt, including the
 * RTA_MULTIPATH payload when the route has sibling nexthops.
 */
static size_t rt6_nlmsg_size(struct fib6_info *rt)
{
	int nexthop_len = 0;

	if (rt->fib6_nsiblings) {
		/* one rtnexthop per sibling, each possibly carrying a
		 * gateway and an encap attribute
		 */
		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
			    + NLA_ALIGN(sizeof(struct rtnexthop))
			    + nla_total_size(16) /* RTA_GATEWAY */
			    + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);

		nexthop_len *= rt->fib6_nsiblings;
	}

	return NLMSG_ALIGN(sizeof(struct rtmsg))
	       + nla_total_size(16) /* RTA_SRC */
	       + nla_total_size(16) /* RTA_DST */
	       + nla_total_size(16) /* RTA_GATEWAY */
	       + nla_total_size(16) /* RTA_PREFSRC */
	       + nla_total_size(4) /* RTA_TABLE */
	       + nla_total_size(4) /* RTA_IIF */
	       + nla_total_size(4) /* RTA_OIF */
	       + nla_total_size(4) /* RTA_PRIORITY */
	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
	       + nla_total_size(sizeof(struct rta_cacheinfo))
	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
	       + nla_total_size(1) /* RTA_PREF */
	       + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
	       + nexthop_len;
}
4453
/* Emit the nexthop attributes of @rt (gateway, oif, encap) into @skb
 * and accumulate RTNH_F_* flags into *@flags.  With @skip_oif set the
 * RTA_OIF attribute is omitted (multipath nexthops carry the ifindex
 * in their rtnexthop header instead).  Returns 0 or -EMSGSIZE.
 */
static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
			    unsigned int *flags, bool skip_oif)
{
	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		*flags |= RTNH_F_DEAD;

	if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
		*flags |= RTNH_F_LINKDOWN;

		/* fib6_ignore_linkdown() inspects RCU-protected state */
		rcu_read_lock();
		if (fib6_ignore_linkdown(rt))
			*flags |= RTNH_F_DEAD;
		rcu_read_unlock();
	}

	if (rt->fib6_flags & RTF_GATEWAY) {
		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
			goto nla_put_failure;
	}

	*flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
	if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
		*flags |= RTNH_F_OFFLOAD;

	/* not needed for multipath encoding b/c it has a rtnexthop struct */
	if (!skip_oif && rt->fib6_nh.nh_dev &&
	    nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
		goto nla_put_failure;

	if (rt->fib6_nh.nh_lwtstate &&
	    lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
4492
/* add multipath next hop: emit one rtnexthop header plus its nested
 * attributes for @rt into @skb.  Returns 0 or -EMSGSIZE.
 */
static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
{
	const struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rtnexthop *rtnh;
	unsigned int flags = 0;

	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
	if (!rtnh)
		goto nla_put_failure;

	/* on-wire hops is weight - 1 (mirrors ip6_route_multipath_add) */
	rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
	rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;

	/* skip_oif=true: ifindex already sits in the rtnexthop header */
	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
		goto nla_put_failure;

	rtnh->rtnh_flags = flags;

	/* length of rtnetlink header + attributes */
	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
4520
/* Build one route netlink message for @rt into @skb.
 *
 * For RTM_GETROUTE replies @dst/@dest/@src describe the concrete
 * lookup result; for plain dumps they are NULL and the fib6_info's own
 * prefix and metrics are reported.  Returns 0, or -EMSGSIZE with the
 * partially built message canceled.
 */
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	struct rtmsg *rtm;
	struct nlmsghdr *nlh;
	long expires = 0;
	u32 *pmetrics;
	u32 table;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt->fib6_dst.plen;
	rtm->rtm_src_len = rt->fib6_src.plen;
	rtm->rtm_tos = 0;
	if (rt->fib6_table)
		table = rt->fib6_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	rtm->rtm_table = table;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;

	rtm->rtm_type = rt->fib6_type;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->fib6_protocol;

	if (rt->fib6_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	/* a specific destination (GETROUTE reply) overrides the prefix */
	if (dest) {
		if (nla_put_in6_addr(skb, RTA_DST, dest))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt->fib6_dst.addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt->fib6_src.addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		/* multicast destinations are answered from the mroute code;
		 * err == 0 means the reply was fully built there
		 */
		if (ipv6_addr_is_multicast(&rt->fib6_dst.addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dest) {
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->fib6_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->fib6_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	/* prefer the dst's live metrics over the fib entry's own */
	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (rt6_add_nexthop(skb, rt) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings, fib6_siblings) {
			if (rt6_add_nexthop(skb, sibling) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
			goto nla_put_failure;
	}

	if (rt->fib6_flags & RTF_EXPIRES) {
		expires = dst ? dst->expires : rt->expires;
		expires -= jiffies;
	}

	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->fib6_flags)))
		goto nla_put_failure;


	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
4653
4654 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4655 {
4656         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4657         struct net *net = arg->net;
4658
4659         if (rt == net->ipv6.fib6_null_entry)
4660                 return 0;
4661
4662         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4663                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4664
4665                 /* user wants prefix routes only */
4666                 if (rtm->rtm_flags & RTM_F_PREFIX &&
4667                     !(rt->fib6_flags & RTF_PREFIX_RT)) {
4668                         /* success since this is not a prefix route */
4669                         return 1;
4670                 }
4671         }
4672
4673         return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4674                              RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4675                              arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
4676 }
4677
/* RTM_GETROUTE handler: resolve the requested flow, then report either
 * the matching FIB entry (RTM_F_FIB_MATCH) or the full lookup result.
 */
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6;
	bool fibmatch;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	memset(&fl6, 0, sizeof(fl6));
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (iif) {
		/* input path: look the flow up as if it arrived on iif */
		struct net_device *dev;
		int flags = 0;

		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			rcu_read_unlock();
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);

		rcu_read_unlock();
	} else {
		/* output path */
		fl6.flowi6_oif = oif;

		dst = ip6_route_output(net, NULL, &fl6);
	}


	rt = container_of(dst, struct rt6_info, dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	/* NOTE(review): this mirrors the dst.error check above — the
	 * null entry presumably carries a non-zero dst.error already;
	 * kept as an explicit belt-and-braces check.
	 */
	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	/* skb now owns the dst reference taken by the lookup above */
	skb_dst_set(skb, &rt->dst);
	if (fibmatch)
		err = rt6_fill_node(net, skb, rt->from, NULL, NULL, NULL, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	else
		err = rt6_fill_node(net, skb, rt->from, dst,
				    &fl6.daddr, &fl6.saddr, iif, RTM_NEWROUTE,
				    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
				    0);
	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}
4798
/* Send a route event (@event, e.g. RTM_NEWROUTE/RTM_DELROUTE) for @rt
 * to the RTNLGRP_IPV6_ROUTE multicast group.  On failure the error is
 * recorded on the group socket so listeners observe it.
 */
void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
		     unsigned int nlm_flags)
{
	struct sk_buff *skb;
	struct net *net = info->nl_net;
	u32 seq;
	int err;

	/* preset so the !skb jump to errout reports -ENOBUFS */
	err = -ENOBUFS;
	seq = info->nlh ? info->nlh->nlmsg_seq : 0;

	/* gfp_any() picks an allocation mode fitting the current context */
	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
	if (!skb)
		goto errout;

	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
			    event, info->portid, seq, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}
4829
/* Netdev notifier: anchor the per-netns special routes (null, and with
 * multiple tables also prohibit/blackhole) to the loopback device on
 * register, and drop their idev references on unregister.
 */
static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	/* only the netns loopback device carries these entries */
	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	 } else if (event == NETDEV_UNREGISTER &&
		    dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER could be fired for multiple times by
		 * netdev_wait_allrefs(). Make sure we only call this once.
		 */
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}
4863
4864 /*
4865  *      /proc
4866  */
4867
4868 #ifdef CONFIG_PROC_FS
4869
/* /proc/net/ipv6_route seq_file operations; the open routine is
 * defined elsewhere in this file.
 */
static const struct file_operations ipv6_route_proc_fops = {
	.open		= ipv6_route_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};
4876
/* Emit one line of per-netns fib6 statistics (/proc/net/rt6_stats). */
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;
	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}
4891
/* open handler for /proc/net/rt6_stats (single-record, netns-aware) */
static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
	return single_open_net(inode, file, rt6_stats_seq_show);
}
4896
/* file_operations for /proc/net/rt6_stats */
static const struct file_operations rt6_stats_seq_fops = {
	.open	 = rt6_stats_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release_net,
};
4903 #endif  /* CONFIG_PROC_FS */
4904
4905 #ifdef CONFIG_SYSCTL
4906
4907 static
4908 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4909                               void __user *buffer, size_t *lenp, loff_t *ppos)
4910 {
4911         struct net *net;
4912         int delay;
4913         if (!write)
4914                 return -EINVAL;
4915
4916         net = (struct net *)ctl->extra1;
4917         delay = net->ipv6.sysctl.flush_delay;
4918         proc_dointvec(ctl, write, buffer, lenp, ppos);
4919         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
4920         return 0;
4921 }
4922
/* Template for the per-netns net.ipv6.route.* sysctl table.  The .data
 * pointers reference init_net here and are rewired to the per-netns
 * copies in ipv6_route_sysctl_init() — keep the entry order in sync
 * with the table[N].data assignments there.
 */
struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		/* same variable as gc_min_interval, in milliseconds */
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{ }
};
4996
/* Duplicate ipv6_route_table_template for @net and point each entry at
 * the per-netns data.  Returns the new table, or NULL on allocation
 * failure.
 */
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		/* indices must match the template entry order */
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
5025 #endif
5026
/*
 * Per-namespace setup of the IPv6 routing core: dst_ops, the template
 * null/prohibit/blackhole routes and the default GC tunables.
 *
 * Returns 0 on success or -ENOMEM; on failure everything allocated so
 * far is released via the goto chain at the bottom (labels are ordered
 * so each one frees the object allocated just before the failure).
 */
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	/* Start from the boot-time template; each netns gets its own copy. */
	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	/* The always-present "no route" dst; its dst_ops must point at
	 * this namespace's copy, not the template.
	 */
	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	/* Default sysctl values; overridable per netns via /proc/sys. */
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
5101
/*
 * Per-namespace teardown: free the template route entries allocated by
 * ip6_route_net_init() and drop the dst entry counter.  Mirrors the
 * allocations in ip6_route_net_init(), including the MULTIPLE_TABLES
 * conditional ones.
 */
static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
5112
5113 static int __net_init ip6_route_net_init_late(struct net *net)
5114 {
5115 #ifdef CONFIG_PROC_FS
5116         proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
5117         proc_create("rt6_stats", 0444, net->proc_net, &rt6_stats_seq_fops);
5118 #endif
5119         return 0;
5120 }
5121
/*
 * Late per-namespace teardown: remove the proc entries registered by
 * ip6_route_net_init_late().
 */
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
5129
/* Core per-netns routing state (templates, dst_ops, sysctl defaults). */
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
5134
5135 static int __net_init ipv6_inetpeer_init(struct net *net)
5136 {
5137         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5138
5139         if (!bp)
5140                 return -ENOMEM;
5141         inet_peer_base_init(bp);
5142         net->ipv6.peers = bp;
5143         return 0;
5144 }
5145
5146 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5147 {
5148         struct inet_peer_base *bp = net->ipv6.peers;
5149
5150         net->ipv6.peers = NULL;
5151         inetpeer_invalidate_tree(bp);
5152         kfree(bp);
5153 }
5154
/* Per-netns inetpeer base lifetime. */
static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};
5159
/* Registered last: proc entries that depend on the core state above. */
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};
5164
/* Netdevice event hook; runs after addrconf's notifier (lower priority). */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
5169
/*
 * Wire init_net's template routes to the loopback device.  Called at
 * boot after the loopback device exists; each in6_dev_get() takes a
 * reference on loopback's inet6 device that these entries then hold.
 */
void __init ip6_route_init_special_entries(void)
{
	/* Registering of the loopback is done before this portion of code,
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for init_net */
	init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #endif
}
5185
/*
 * Boot-time initialisation of the IPv6 routing subsystem: slab cache,
 * pernet subsystems, FIB, xfrm, policy rules, rtnetlink handlers, the
 * netdevice notifier and the per-cpu uncached-route lists.
 *
 * Returns 0 on success or a negative errno.  On failure the goto chain
 * below unwinds every step already completed, in reverse order of
 * registration — the label order is load-bearing.
 */
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	/* Blackhole dsts come from the same slab as regular ones. */
	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	/* All three handlers share one unwind label: rtnl_unregister_all()
	 * drops whichever of them were registered before the failure.
	 */
	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
5277
/*
 * Module/subsystem teardown: undo everything ip6_route_init() set up,
 * in reverse order of initialisation.
 */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}