/*
 * linux.git: net/ipv6/route.c @ 0d861bd076731375dce0156d4949bec17dbcf27c
 */
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <trace/events/fib6.h>
67
68 #include <linux/uaccess.h>
69
70 #ifdef CONFIG_SYSCTL
71 #include <linux/sysctl.h>
72 #endif
73
74 enum rt6_nud_state {
75         RT6_NUD_FAIL_HARD = -3,
76         RT6_NUD_FAIL_PROBE = -2,
77         RT6_NUD_FAIL_DO_RR = -1,
78         RT6_NUD_SUCCEED = 1
79 };
80
81 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
82 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
83 static unsigned int      ip6_mtu(const struct dst_entry *dst);
84 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
85 static void             ip6_dst_destroy(struct dst_entry *);
86 static void             ip6_dst_ifdown(struct dst_entry *,
87                                        struct net_device *dev, int how);
88 static int               ip6_dst_gc(struct dst_ops *ops);
89
90 static int              ip6_pkt_discard(struct sk_buff *skb);
91 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
92 static int              ip6_pkt_prohibit(struct sk_buff *skb);
93 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
94 static void             ip6_link_failure(struct sk_buff *skb);
95 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
96                                            struct sk_buff *skb, u32 mtu);
97 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
98                                         struct sk_buff *skb);
99 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
100 static size_t rt6_nlmsg_size(struct rt6_info *rt);
101 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
102                          struct rt6_info *rt, struct dst_entry *dst,
103                          struct in6_addr *dest, struct in6_addr *src,
104                          int iif, int type, u32 portid, u32 seq,
105                          unsigned int flags);
106 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
107                                            struct in6_addr *daddr,
108                                            struct in6_addr *saddr);
109
110 #ifdef CONFIG_IPV6_ROUTE_INFO
111 static struct rt6_info *rt6_add_route_info(struct net *net,
112                                            const struct in6_addr *prefix, int prefixlen,
113                                            const struct in6_addr *gwaddr,
114                                            struct net_device *dev,
115                                            unsigned int pref);
116 static struct rt6_info *rt6_get_route_info(struct net *net,
117                                            const struct in6_addr *prefix, int prefixlen,
118                                            const struct in6_addr *gwaddr,
119                                            struct net_device *dev);
120 #endif
121
122 struct uncached_list {
123         spinlock_t              lock;
124         struct list_head        head;
125 };
126
127 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
128
/* Track a dst/rt6_info that is not stored in the FIB tree (e.g. a
 * per-flow cached clone) on the current CPU's uncached list so it can
 * be re-homed when its device disappears (rt6_uncached_list_flush_dev).
 */
void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	/* Remember which per-CPU list we joined; deletion may run on a
	 * different CPU (see rt6_uncached_list_del).
	 */
	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
	/* NOTE(review): fib_rt_uncache is decremented in
	 * rt6_uncached_list_del() but not incremented here — presumably the
	 * caller accounts for it; confirm against call sites.
	 */
}
139
/* Unlink @rt from the per-CPU uncached list it was added to, if any,
 * and drop the fib_rt_uncache statistic accordingly.
 */
void rt6_uncached_list_del(struct rt6_info *rt)
{
	/* list_empty() means the entry was never added (or already removed),
	 * so there is nothing to unlink.
	 */
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}
152
/* A device is going away: walk every CPU's uncached list and re-home
 * any entry still referencing @dev onto the namespace's loopback device
 * (both the inet6_dev and the dst's net_device reference), so the
 * entries stay valid after @dev is unregistered.
 */
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	/* Nothing to migrate when loopback itself is going down. */
	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				/* take the loopback reference before
				 * releasing the old device's
				 */
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
184
185 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
186                                              struct sk_buff *skb,
187                                              const void *daddr)
188 {
189         if (!ipv6_addr_any(p))
190                 return (const void *) p;
191         else if (skb)
192                 return &ipv6_hdr(skb)->daddr;
193         return daddr;
194 }
195
/* Look up (or create) the neighbour entry for the next hop of an IPv6
 * route: @gw when set, otherwise the packet/caller destination address.
 * Returns a referenced neighbour, or the ERR_PTR from neigh_create().
 */
struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
{
	struct neighbour *n;

	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	if (n)
		return n;
	/* no cached entry: create one in the IPv6 neighbour table */
	return neigh_create(&nd_tbl, daddr, dev);
}
209
210 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
211                                               struct sk_buff *skb,
212                                               const void *daddr)
213 {
214         const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
215
216         return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
217 }
218
/* dst_ops->confirm_neigh hook: mark the next-hop neighbour as recently
 * confirmed (upper-layer reachability confirmation). Skips devices that
 * do not do neighbour resolution and multicast destinations.
 */
static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	/* no neighbour state to confirm on NOARP/loopback devices */
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	/* multicast destinations have no unicast neighbour entry */
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}
233
/* dst_ops used for ordinary IPv6 routes; copied per-netns into
 * net->ipv6.ip6_dst_ops.
 */
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	dst_cow_metrics_generic,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};
252
253 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
254 {
255         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
256
257         return mtu ? : dst->dev->mtu;
258 }
259
/* Blackhole dsts deliberately ignore PMTU updates. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

/* Blackhole dsts deliberately ignore redirects. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}
269
/* dst_ops for blackhole route copies: like ip6_dst_ops_template but
 * with no-op PMTU/redirect handlers and no gc/ifdown hooks.
 */
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
};
281
/* Metrics shared by the template routes below; hop limit 0 means
 * "use the per-device/namespace default".
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};
285
/* FIB-side null route template: matched when no real route exists.
 * Worst possible metric so any genuine route wins.
 */
static const struct rt6_info fib6_null_entry_template = {
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32)0,
	.rt6i_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};
294
/* dst-side null route template: discards packets and reports
 * -ENETUNREACH to the sender.
 */
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
};
310
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

/* Policy-routing "prohibit" template: drops packets with -EACCES
 * (ICMPv6 administratively prohibited is generated by the handlers).
 */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_PROHIBIT,
};

/* Policy-routing "blackhole" template: silently discards packets. */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_BLACKHOLE,
};

#endif
346
/* Initialize the rt6_info-specific tail of a freshly allocated dst:
 * zero everything after the embedded dst_entry, then set up the list
 * heads so list_empty() checks work before the entry is linked.
 */
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	/* dst + 1 points just past the dst_entry member, i.e. at the
	 * first rt6_info-specific field.
	 */
	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_siblings);
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}
355
/* Allocate and initialize a rt6_info using this namespace's
 * ip6_dst_ops, with an initial reference. Returns NULL on allocation
 * failure. Updates the fib_rt_alloc statistic on success.
 */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);
371
/* dst_ops->destroy hook: release everything a rt6_info holds — COW
 * metrics, uncached-list membership, the inet6_dev reference, and the
 * reference on the FIB entry this dst was created from.
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct rt6_info *from = rt->from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	/* clear the back-pointer before dropping the fib6 reference */
	rt->from = NULL;
	fib6_info_release(from);
}
390
/* dst_ops->ifdown hook: the route's device is going down, so move the
 * inet6_dev reference over to the namespace's loopback device to keep
 * the dst usable until it is destroyed.
 */
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}
407
408 static bool __rt6_check_expired(const struct rt6_info *rt)
409 {
410         if (rt->rt6i_flags & RTF_EXPIRES)
411                 return time_after(jiffies, rt->dst.expires);
412         else
413                 return false;
414 }
415
/* Like __rt6_check_expired(), but a dst cloned from a FIB entry
 * (rt->from set) is also considered expired when it has been obsoleted
 * or its originating FIB entry has expired.
 */
static bool rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (rt->from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(rt->from);
	}
	return false;
}
427
/* Pick one nexthop among @match and its multipath siblings using the
 * flow hash against each nexthop's upper bound (weighted hash-threshold
 * selection). Falls back to @match when no sibling qualifies.
 */
static struct rt6_info *rt6_multipath_select(const struct net *net,
					     struct rt6_info *match,
					     struct flowi6 *fl6, int oif,
					     const struct sk_buff *skb,
					     int strict)
{
	struct rt6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->rt6i_siblings,
				 rt6i_siblings) {
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		/* first in-range sibling decides: usable or keep @match */
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}
460
461 /*
462  *      Route lookup. rcu_read_lock() should be held.
463  */
464
/* Walk the routes hanging off @rt and return the one whose device
 * matches the lookup constraints: exact @oif match wins; loopback
 * routes are remembered as a fallback; with no @oif, match on @saddr
 * being a local address of the route's device. Dead nexthops are
 * skipped; fib6_null_entry is returned when a strict interface match
 * is required but impossible. Caller holds rcu_read_lock().
 */
static inline struct rt6_info *rt6_device_match(struct net *net,
						    struct rt6_info *rt,
						    const struct in6_addr *saddr,
						    int oif,
						    int flags)
{
	struct rt6_info *local = NULL;
	struct rt6_info *sprt;

	/* unconstrained lookup: first route wins if its nexthop is alive */
	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) {
		const struct net_device *dev = sprt->fib6_nh.nh_dev;

		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
			if (dev->flags & IFF_LOOPBACK) {
				if (!sprt->rt6i_idev ||
				    sprt->rt6i_idev->dev->ifindex != oif) {
					if (flags & RT6_LOOKUP_F_IFACE)
						continue;
					/* prefer a loopback route whose idev
					 * matches oif over one that doesn't
					 */
					if (local &&
					    local->rt6i_idev->dev->ifindex == oif)
						continue;
				}
				local = sprt;
			}
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif) {
		if (local)
			return local;

		/* strict interface match requested but nothing matched */
		if (flags & RT6_LOOKUP_F_IFACE)
			return net->ipv6.fib6_null_entry;
	}

	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
}
515
#ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred-work payload for a single router reachability probe. */
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;	/* gateway to probe */
	struct net_device *dev;	/* held reference, released in the worker */
};

/* Workqueue handler: send a Neighbor Solicitation to the target's
 * solicited-node multicast address, then drop the device reference and
 * free the work item.
 */
static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}

/* Kick off a rate-limited reachability probe of a gateway route's next
 * hop when its neighbour entry is missing or not in a VALID state. The
 * actual NS transmission is deferred to a workqueue because it cannot
 * be done under the locks held here.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct __rt6_probe_work *work;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
		return;

	nh_gw = &rt->fib6_nh.nh_gw;
	dev = rt->fib6_nh.nh_dev;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		work = NULL;
		write_lock(&neigh->lock);
		/* recheck under the lock; only probe once per
		 * rtr_probe_interval
		 */
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated +
			       rt->rt6i_idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		/* reference released by rt6_probe_deferred() */
		dev_hold(dev);
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
/* Probing is only meaningful with router-preference support. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
592
593 /*
594  * Default Router Selection (RFC 2461 6.3.6)
595  */
596 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
597 {
598         const struct net_device *dev = rt->fib6_nh.nh_dev;
599
600         if (!oif || dev->ifindex == oif)
601                 return 2;
602         if ((dev->flags & IFF_LOOPBACK) &&
603             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
604                 return 1;
605         return 0;
606 }
607
/* Classify next-hop reachability for route scoring. Non-gateway (or
 * NONEXTHOP) routes always succeed. With router preference enabled,
 * anything short of NUD_FAILED counts as reachable and a missing
 * neighbour succeeds (a probe will be triggered); without it, a missing
 * neighbour requests round-robin fallback.
 */
static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	if (rt->rt6i_flags & RTF_NONEXTHOP ||
	    !(rt->rt6i_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
					  &rt->fib6_nh.nh_gw);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
639
/* Compute a route's selection score: device match in the low bits,
 * decoded router preference shifted above them. Returns a negative
 * rt6_nud_state value when the route must be rejected (bad interface
 * under strict matching, or unreachable neighbour when
 * RT6_LOOKUP_F_REACHABLE is requested).
 */
static int rt6_score_route(struct rt6_info *rt, int oif,
			   int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	/* preference occupies bits above the device-match score */
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}
658
/* Score @rt and return whichever of @rt / @match scores higher,
 * tracking the best score in *@mpri and whether round-robin fallback
 * should be done in *@do_rr. Dead, link-down (when not ignored), and
 * expired routes are skipped without changing the current best.
 */
static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
				   int *mpri, struct rt6_info *match,
				   bool *do_rr)
{
	int m;
	bool match_do_rr = false;
	struct inet6_dev *idev = rt->rt6i_idev;

	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		goto out;

	if (idev->cnf.ignore_routes_with_linkdown &&
	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (fib6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}
698
/* Scan the routes of @fn with metric @metric, starting at the
 * round-robin head @rr_head and wrapping via @leaf, for the best
 * scoring entry. If none matched at that metric, continue into the
 * first route with a different metric (@cont) and beyond.
 */
static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
				     struct rt6_info *leaf,
				     struct rt6_info *rr_head,
				     u32 metric, int oif, int strict,
				     bool *do_rr)
{
	struct rt6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	/* first pass: rr_head to end of the metric group */
	for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	/* second pass: wrap around from the leaf up to rr_head */
	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->rt6_next)) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	/* nothing at the preferred metric: try the remaining routes */
	for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
737
/* Select the best route under fib6 node @fn (default router selection,
 * RFC 4191/2461-style), advancing the node's round-robin pointer when
 * find_match() asked for it. Returns fib6_null_entry when the node has
 * no usable leaf. Caller holds rcu_read_lock().
 */
static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
				   int oif, int strict)
{
	struct rt6_info *leaf = rcu_dereference(fn->leaf);
	struct rt6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not points to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->rt6i_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->rt6i_src.plen)
		key_plen = rt0->rt6i_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct rt6_info *next = rcu_dereference(rt0->rt6_next);

		/* no entries matched; do round-robin */
		if (!next || next->rt6i_metric != rt0->rt6i_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->rt6i_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->rt6i_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.fib6_null_entry;
}
787
788 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
789 {
790         return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
791 }
792
#ifdef CONFIG_IPV6_ROUTE_INFO
/* Process a Route Information option received in a Router Advertisement
 * (RFC 4191): validate the option, then add, refresh, or delete the
 * corresponding route depending on the advertised lifetime. @opt/@len
 * come from untrusted network input and are bounds-checked first.
 * Returns 0 on success (including benign no-ops), -EINVAL on a
 * malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		/* prefixes longer than 64 bits need at least 2 units of
		 * option payload
		 */
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	/* prefix_len == 0 advertises a default route via this router */
	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	/* zero lifetime withdraws an existing route */
	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
#endif
868
869 /*
870  *      Misc support functions
871  */
872
873 /* called with rcu_lock held */
874 static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
875 {
876         struct net_device *dev = rt->fib6_nh.nh_dev;
877
878         if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) {
879                 /* for copies of local routes, dst->dev needs to be the
880                  * device if it is a master device, the master device if
881                  * device is enslaved, and the loopback as the default
882                  */
883                 if (netif_is_l3_slave(dev) &&
884                     !rt6_need_strict(&rt->rt6i_dst.addr))
885                         dev = l3mdev_master_dev_rcu(dev);
886                 else if (!netif_is_l3_master(dev))
887                         dev = dev_net(dev)->loopback_dev;
888                 /* last case is netif_is_l3_master(dev) is true in which
889                  * case we want dev returned to be dev
890                  */
891         }
892
893         return dev;
894 }
895
/* Map RTN_* route types to the dst error reported for reject-style
 * routes; types that deliver or forward traffic map to 0 (no error).
 */
static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};
910
/* Translate a fib6_type (RTN_*) into the dst.error value; 0 for types
 * that are not rejected (see fib6_prop above).
 */
static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}
915
916 static unsigned short fib6_info_dst_flags(struct rt6_info *rt)
917 {
918         unsigned short flags = 0;
919
920         if (rt->dst_nocount)
921                 flags |= DST_NOCOUNT;
922         if (rt->dst_nopolicy)
923                 flags |= DST_NOPOLICY;
924         if (rt->dst_host)
925                 flags |= DST_HOST;
926
927         return flags;
928 }
929
930 static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct rt6_info *ort)
931 {
932         rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);
933
934         switch (ort->fib6_type) {
935         case RTN_BLACKHOLE:
936                 rt->dst.output = dst_discard_out;
937                 rt->dst.input = dst_discard;
938                 break;
939         case RTN_PROHIBIT:
940                 rt->dst.output = ip6_pkt_prohibit_out;
941                 rt->dst.input = ip6_pkt_prohibit;
942                 break;
943         case RTN_THROW:
944         case RTN_UNREACHABLE:
945         default:
946                 rt->dst.output = ip6_pkt_discard_out;
947                 rt->dst.input = ip6_pkt_discard;
948                 break;
949         }
950 }
951
/* Initialize the dst portion of @rt (flags, error, input/output
 * handlers, lwtunnel state) from the fib entry @ort.
 */
static void ip6_rt_init_dst(struct rt6_info *rt, struct rt6_info *ort)
{
	rt->dst.flags |= fib6_info_dst_flags(ort);

	/* reject routes only get the error handlers */
	if (ort->rt6i_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, ort);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	/* input handler: local delivery, multicast, or forwarding */
	if (ort->fib6_type == RTN_LOCAL) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&ort->rt6i_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (ort->fib6_nh.nh_lwtstate) {
		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}
979
/* Link dst @rt back to the fib entry @from it was cloned from, taking
 * a reference on @from and sharing its metrics.
 */
static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	fib6_info_hold(from);
	rt->from = from;
	dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
	/* non-default metrics are refcounted: take a reference and mark
	 * the dst so the metrics are treated as shared
	 */
	if (from->fib6_metrics != &dst_default_metrics) {
		rt->dst._metrics |= DST_METRICS_REFCOUNTED;
		refcount_inc(&from->fib6_metrics->refcnt);
	}
}
991
/* Initialize dst @rt as a copy of fib entry @ort: dst handlers,
 * addresses, flags and table, taking references on the inet6 device,
 * on @ort itself (via rt6_set_from) and on the lwtunnel state.
 */
static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
{
	ip6_rt_init_dst(rt, ort);

	rt->rt6i_dst = ort->rt6i_dst;
	rt->rt6i_idev = ort->rt6i_idev;
	if (rt->rt6i_idev)
		in6_dev_hold(rt->rt6i_idev);
	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
	rt->rt6i_flags = ort->rt6i_flags;
	rt6_set_from(rt, ort);
	rt->rt6i_metric = ort->rt6i_metric;
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->rt6i_src;
#endif
	rt->rt6i_prefsrc = ort->rt6i_prefsrc;
	rt->rt6i_table = ort->rt6i_table;
	rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
}
1011
/* Walk back up the fib tree from @fn until a node carrying route info
 * (RTN_RTINFO) is found; return NULL once the tree root is reached.
 * When the parent has a source-address subtree that @fn did not come
 * from, descend into that subtree with a lookup on @saddr instead of
 * continuing upward.
 * Caller must hold rcu_read_lock() (fn->parent is rcu_dereference'd).
 */
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}
1029
1030 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
1031                           bool null_fallback)
1032 {
1033         struct rt6_info *rt = *prt;
1034
1035         if (dst_hold_safe(&rt->dst))
1036                 return true;
1037         if (null_fallback) {
1038                 rt = net->ipv6.ip6_null_entry;
1039                 dst_hold(&rt->dst);
1040         } else {
1041                 rt = NULL;
1042         }
1043         *prt = rt;
1044         return false;
1045 }
1046
1047 /* called with rcu_lock held */
1048 static struct rt6_info *ip6_create_rt_rcu(struct rt6_info *rt)
1049 {
1050         unsigned short flags = fib6_info_dst_flags(rt);
1051         struct net_device *dev = rt->fib6_nh.nh_dev;
1052         struct rt6_info *nrt;
1053
1054         nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1055         if (nrt)
1056                 ip6_rt_copy_init(nrt, rt);
1057
1058         return nrt;
1059 }
1060
/* Route lookup for @fl6 in @table: walk the fib trie, match on device
 * and (for multipath) pick a sibling, backtrack to less specific nodes
 * on a miss, and prefer a cached exception entry when one exists.
 * Always returns a held dst; on a total miss this is ip6_null_entry.
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct rt6_info *f6i;
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	f6i = rcu_dereference(fn->leaf);
	if (!f6i) {
		f6i = net->ipv6.fib6_null_entry;
	} else {
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
				      fl6->flowi6_oif, flags);
		/* spread over multipath siblings only without an oif */
		if (f6i->rt6i_nsiblings && fl6->flowi6_oif == 0)
			f6i = rt6_multipath_select(net, f6i, fl6,
						   fl6->flowi6_oif, skb, flags);
	}
	/* nothing usable at this node: back up to a less specific one */
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);
	} else if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		/* no cached entry: create a dst copy of the fib entry,
		 * falling back to the null entry on allocation failure
		 */
		rt = ip6_create_rt_rcu(f6i);
		if (!rt) {
			rt = net->ipv6.ip6_null_entry;
			dst_hold(&rt->dst);
		}
	}

	rcu_read_unlock();

	trace_fib6_table_lookup(net, rt, table, fl6);

	return rt;
}
1115
/* Resolve @fl6 through fib6_rule_lookup() using the plain table lookup
 * function above.  Returns a held dst (never NULL).
 */
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
1122
1123 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1124                             const struct in6_addr *saddr, int oif,
1125                             const struct sk_buff *skb, int strict)
1126 {
1127         struct flowi6 fl6 = {
1128                 .flowi6_oif = oif,
1129                 .daddr = *daddr,
1130         };
1131         struct dst_entry *dst;
1132         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1133
1134         if (saddr) {
1135                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1136                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1137         }
1138
1139         dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1140         if (dst->error == 0)
1141                 return (struct rt6_info *) dst;
1142
1143         dst_release(dst);
1144
1145         return NULL;
1146 }
1147 EXPORT_SYMBOL(rt6_lookup);
1148
/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason,
 * the route is released.
 * Caller must hold dst before calling it.
 */
1154
1155 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
1156                         struct netlink_ext_ack *extack)
1157 {
1158         int err;
1159         struct fib6_table *table;
1160
1161         table = rt->rt6i_table;
1162         spin_lock_bh(&table->tb6_lock);
1163         err = fib6_add(&table->tb6_root, rt, info, extack);
1164         spin_unlock_bh(&table->tb6_lock);
1165
1166         return err;
1167 }
1168
1169 int ip6_ins_rt(struct net *net, struct rt6_info *rt)
1170 {
1171         struct nl_info info = { .nl_net = net, };
1172
1173         return __ip6_ins_rt(rt, &info, NULL);
1174 }
1175
/* Allocate an RTF_CACHE clone of fib entry @ort for destination @daddr
 * (and, with subtrees, source @saddr).  The clone is a /128 host
 * route.  Returns NULL on allocation failure.
 */
static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(ort);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	rcu_read_unlock();
	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->rt6i_metric = 0;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		/* a clone of a non-/128 on-link route for one of the
		 * route's own addresses is treated as anycast
		 */
		if (ort->rt6i_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
1215
1216 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
1217 {
1218         unsigned short flags = fib6_info_dst_flags(rt);
1219         struct net_device *dev;
1220         struct rt6_info *pcpu_rt;
1221
1222         rcu_read_lock();
1223         dev = ip6_rt_get_dev_rcu(rt);
1224         pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
1225         rcu_read_unlock();
1226         if (!pcpu_rt)
1227                 return NULL;
1228         ip6_rt_copy_init(pcpu_rt, rt);
1229         pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1230         pcpu_rt->rt6i_flags |= RTF_PCPU;
1231         return pcpu_rt;
1232 }
1233
1234 /* It should be called with rcu_read_lock() acquired */
1235 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1236 {
1237         struct rt6_info *pcpu_rt, **p;
1238
1239         p = this_cpu_ptr(rt->rt6i_pcpu);
1240         pcpu_rt = *p;
1241
1242         if (pcpu_rt)
1243                 ip6_hold_safe(NULL, &pcpu_rt, false);
1244
1245         return pcpu_rt;
1246 }
1247
/* Create this cpu's cached copy of @rt and publish it in rt->rt6i_pcpu.
 * On allocation failure the (held) null entry is returned instead.
 * The caller must ensure the per-cpu slot is currently empty: the
 * cmpxchg() from NULL is expected to succeed (BUG otherwise).
 */
static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	/* one reference for the per-cpu slot, returned held as well */
	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	return pcpu_rt;
}
1266
/* exception hash table implementation
 *
 * Exception (cached) routes hang off their parent fib entry via
 * rt6i_exception_bucket.  All modifications to the tables are
 * serialized by this lock; lookups run under it or under RCU.
 */
static DEFINE_SPINLOCK(rt6_exception_lock);
1270
/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	/* detach the cached route from its fib node before dropping it */
	rt6_ex->rt6i->rt6i_node = NULL;
	hlist_del_rcu(&rt6_ex->hlist);
	ip6_rt_put(rt6_ex->rt6i);
	/* RCU readers may still walk the chain: defer the free */
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
	net->ipv6.rt6_stats->fib_rt_cache--;
}
1291
1292 /* Remove oldest rt6_ex in bucket and free the memory
1293  * Caller must hold rt6_exception_lock
1294  */
1295 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1296 {
1297         struct rt6_exception *rt6_ex, *oldest = NULL;
1298
1299         if (!bucket)
1300                 return;
1301
1302         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1303                 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1304                         oldest = rt6_ex;
1305         }
1306         rt6_remove_exception(bucket, oldest);
1307 }
1308
/* Hash (@dst[, @src]) to a bucket index in
 * [0, 1 << FIB6_EXCEPTION_BUCKET_SIZE_SHIFT).  The jhash seed is
 * initialized lazily on first use; @src contributes only when
 * IPV6_SUBTREES is configured.
 */
static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
1324
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 *
 * Returns the matching entry or NULL.  Note that *bucket is advanced
 * to the hashed bucket even when no entry matches, so callers can
 * insert into the right bucket afterwards.
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1357
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 *
 * Lockless counterpart of __rt6_find_exception_spinlock(): same
 * contract, but walks the chain with the RCU-safe iterator.
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1392
1393 static unsigned int fib6_mtu(const struct rt6_info *rt)
1394 {
1395         unsigned int mtu;
1396
1397         mtu = rt->fib6_pmtu ? : rt->rt6i_idev->cnf.mtu6;
1398         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1399
1400         return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
1401 }
1402
/* Insert cached route @nrt into the exception table of its parent fib
 * entry @ort, replacing any existing entry with the same
 * (daddr[, saddr]) key.  Allocates the bucket array on first use, caps
 * the bucket depth by evicting the oldest entry, and bumps the table
 * sernum so stale cached dsts get revalidated.
 * Returns 0 on success or a negative errno.
 */
static int rt6_insert_exception(struct rt6_info *nrt,
				struct rt6_info *ort)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	/* ort is being flushed: do not re-create its bucket list */
	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		/* first exception for ort: allocate the bucket array */
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->rt6i_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
		err = -EINVAL;
		goto out;
	}

	/* replace any existing exception for the same key; note this
	 * also advances bucket to the hashed slot used for insertion
	 */
	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	atomic_inc(&nrt->rt6i_ref);
	nrt->rt6i_node = ort->rt6i_node;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	/* cap the chain length by evicting the oldest entry */
	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->rt6i_table->tb6_lock);
		fib6_update_sernum(net, ort);
		spin_unlock_bh(&ort->rt6i_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}
1489
/* Remove and free all exception routes hanging off fib entry @rt, and
 * mark the entry so rt6_insert_exception() cannot re-create the bucket
 * list afterwards.
 */
void rt6_flush_exceptions(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() to recreate the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		/* rt6_remove_exception() decrements depth per entry */
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}
1516
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 *
 * Returns the unexpired cached route for (@daddr[, @saddr]), or NULL
 * when none exists or the match has expired.
 */
static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->rt6i_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}
1548
/* Remove the passed in cached rt from the hash table that contains it
 *
 * Returns 0 on success, -EINVAL when @rt is not an RTF_CACHE route or
 * has no parent fib entry, -ENOENT when no matching exception exists.
 */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_info *from = rt->from;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err;

	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->rt6i_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}
1591
/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 *
 * No-op when @rt is not an RTF_CACHE route, has no parent fib entry,
 * or no matching exception exists.
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_info *from = rt->from;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;

	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return;

	rcu_read_lock();
	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->rt6i_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

	rcu_read_unlock();
}
1627
1628 static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
1629 {
1630         struct rt6_exception_bucket *bucket;
1631         struct rt6_exception *rt6_ex;
1632         int i;
1633
1634         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1635                                         lockdep_is_held(&rt6_exception_lock));
1636
1637         if (bucket) {
1638                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1639                         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1640                                 rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
1641                         }
1642                         bucket++;
1643                 }
1644         }
1645 }
1646
1647 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1648                                          struct rt6_info *rt, int mtu)
1649 {
1650         /* If the new MTU is lower than the route PMTU, this new MTU will be the
1651          * lowest MTU in the path: always allow updating the route PMTU to
1652          * reflect PMTU decreases.
1653          *
1654          * If the new MTU is higher, and the route PMTU is equal to the local
1655          * MTU, this means the old MTU is the lowest in the path, so allow
1656          * updating it: if other nodes now have lower MTUs, PMTU discovery will
1657          * handle this.
1658          */
1659
1660         if (dst_mtu(&rt->dst) >= mtu)
1661                 return true;
1662
1663         if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1664                 return true;
1665
1666         return false;
1667 }
1668
/* Set the RTAX_MTU metric of @rt's exception routes to @mtu where
 * permitted (see rt6_mtu_change_route_allowed()).
 * Caller must hold rt6_exception_lock.
 */
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct rt6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}
1697
1698 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
1699
/* Remove every cached gateway route (RTF_CACHE | RTF_GATEWAY) of fib
 * entry @rt whose gateway address equals @gateway.
 */
static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* cheap lockless check before taking the exception lock */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}
1734
/* Garbage-collect one exception entry: drop aged-out non-EXPIRES clones,
 * drop EXPIRES entries past their expiry, and drop gateway entries whose
 * neighbour no longer advertises itself as a router.  Entries that survive
 * bump gc_args->more so the GC timer is rescheduled.  Called from
 * rt6_age_exceptions() with rt6_exception_lock held and BH-RCU read-side
 * locked (needed for __ipv6_neigh_lookup_noref()).
 */
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others have still references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		/* No reference taken: we stay inside the RCU read section. */
		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}
1778
/* Walk @rt's entire exception table and let rt6_age_examine_exception()
 * decide each entry's fate.  BH-RCU is taken before the spinlock so
 * neighbour lookups in the examine helper are safe.
 */
void rt6_age_exceptions(struct rt6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* Lockless fast path: no exception table at all. */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			/* _safe: the examine helper may unlink entries */
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}
1809
/* Core policy-routing lookup shared by the input and output paths.
 *
 * Under RCU, selects a fib6 entry from @table (multipath-aware),
 * backtracking to less-specific nodes and finally retrying without the
 * REACHABLE restriction.  Returns one of:
 *   - a cached exception route (PMTU/redirect clone),
 *   - a fresh uncached RTF_CACHE clone (FLOWI_FLAG_KNOWN_NH case), or
 *   - a per-cpu copy of the fib6 entry (common case),
 * always with a reference (or noref use) taken before rcu_read_unlock().
 */
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_node *fn, *saved_fn;
	struct rt6_info *f6i;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	/* When forwarding is off, prefer (probably) reachable routers */
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;	/* remembered for the REACHABLE-relaxed retry */

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	f6i = rt6_select(net, fn, oif, strict);
	if (f6i->rt6i_nsiblings)
		f6i = rt6_multipath_select(net, f6i, fl6, oif, skb, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		trace_fib6_table_lookup(net, rt, table, fl6);
		return rt;
	}

	/*Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		trace_fib6_table_lookup(net, rt, table, fl6);
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(f6i->rt6i_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */

		struct rt6_info *uncached_rt;

		/* Keep f6i alive across rcu_read_unlock() while cloning */
		fib6_info_hold(f6i);
		rcu_read_unlock();

		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);
		fib6_info_release(f6i);

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		trace_fib6_table_lookup(net, uncached_rt, table, fl6);
		return uncached_rt;

	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		/* BH must stay disabled while touching the per-cpu cache */
		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(f6i);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, f6i);

		local_bh_enable();
		rcu_read_unlock();
		trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
1913
1914 static struct rt6_info *ip6_pol_route_input(struct net *net,
1915                                             struct fib6_table *table,
1916                                             struct flowi6 *fl6,
1917                                             const struct sk_buff *skb,
1918                                             int flags)
1919 {
1920         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1921 }
1922
1923 struct dst_entry *ip6_route_input_lookup(struct net *net,
1924                                          struct net_device *dev,
1925                                          struct flowi6 *fl6,
1926                                          const struct sk_buff *skb,
1927                                          int flags)
1928 {
1929         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1930                 flags |= RT6_LOOKUP_F_IFACE;
1931
1932         return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1933 }
1934 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1935
/* Fill @keys with the L3 fields used for multipath hashing.  For ICMPv6
 * error messages, hash over the embedded (offending) packet's header so
 * the error follows the same path as the flow it refers to; in that case
 * any pre-dissected @flkeys are discarded.
 */
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	/* Only ICMPv6 errors carry an embedded packet worth hashing */
	icmph = icmp6_hdr(skb);
	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	/* The inner header may be non-linear; copy it out if needed */
	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
	_flkeys = NULL;	/* dissected keys describe the outer packet */
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowinfo(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}
1978
/* if skb is set it will be used and fl6 can be NULL */
/* Compute the multipath hash for an IPv6 flow, per the per-netns
 * multipath hash policy:
 *   0 - L3 keys: addresses, flow label, next header
 *   1 - L4 keys: addresses, ports, protocol
 * The result is halved (>> 1) before being returned.
 */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		/* L3 policy: hash addresses/flow label/protocol, using the
		 * (possibly inner) packet headers when an skb is available.
		 */
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		/* L4 policy: include transport ports */
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}
2035
/* Input-path route resolution: build a flowi6 from the packet's IPv6
 * header (plus tunnel metadata and, for ICMPv6, a multipath hash) and
 * attach the looked-up dst to @skb, replacing any existing one.
 */
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	/* Collect-metadata tunnels key the lookup on the tunnel id */
	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	/* Hash ICMPv6 here so errors can follow the flow they refer to */
	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}
2065
2066 static struct rt6_info *ip6_pol_route_output(struct net *net,
2067                                              struct fib6_table *table,
2068                                              struct flowi6 *fl6,
2069                                              const struct sk_buff *skb,
2070                                              int flags)
2071 {
2072         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2073 }
2074
2075 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
2076                                          struct flowi6 *fl6, int flags)
2077 {
2078         bool any_src;
2079
2080         if (rt6_need_strict(&fl6->daddr)) {
2081                 struct dst_entry *dst;
2082
2083                 dst = l3mdev_link_scope_lookup(net, fl6);
2084                 if (dst)
2085                         return dst;
2086         }
2087
2088         fl6->flowi6_iif = LOOPBACK_IFINDEX;
2089
2090         any_src = ipv6_addr_any(&fl6->saddr);
2091         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2092             (fl6->flowi6_oif && any_src))
2093                 flags |= RT6_LOOKUP_F_IFACE;
2094
2095         if (!any_src)
2096                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2097         else if (sk)
2098                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2099
2100         return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2101 }
2102 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2103
/* Replace @dst_orig with a "blackhole" copy: a new rt6_info bound to the
 * loopback device whose input/output handlers discard all packets, with
 * metrics, gateway, flags (minus RTF_PCPU) and keys copied from the
 * original.  Consumes the caller's reference on @dst_orig.  Returns the
 * new dst, or ERR_PTR(-ENOMEM) on allocation failure.
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	/* DST_OBSOLETE_DEAD: the blackhole dst never revalidates */
	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new = &rt->dst;
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
		rt->rt6i_metric = 0;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}
2137
2138 /*
2139  *      Destination cache support functions
2140  */
2141
2142 static bool fib6_check(struct rt6_info *f6i, u32 cookie)
2143 {
2144         u32 rt_cookie = 0;
2145
2146         if ((f6i && !rt6_get_cookie_safe(f6i, &rt_cookie)) ||
2147              rt_cookie != cookie)
2148                 return false;
2149
2150         if (fib6_check_expired(f6i))
2151                 return false;
2152
2153         return true;
2154 }
2155
2156 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
2157 {
2158         u32 rt_cookie = 0;
2159
2160         if ((rt->from && !rt6_get_cookie_safe(rt->from, &rt_cookie)) ||
2161             rt_cookie != cookie)
2162                 return NULL;
2163
2164         if (rt6_check_expired(rt))
2165                 return NULL;
2166
2167         return &rt->dst;
2168 }
2169
2170 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
2171 {
2172         if (!__rt6_check_expired(rt) &&
2173             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2174             fib6_check(rt->from, cookie))
2175                 return &rt->dst;
2176         else
2177                 return NULL;
2178 }
2179
2180 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2181 {
2182         struct rt6_info *rt;
2183
2184         rt = (struct rt6_info *) dst;
2185
2186         /* All IPV6 dsts are created with ->obsolete set to the value
2187          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2188          * into this function always.
2189          */
2190
2191         if (rt->rt6i_flags & RTF_PCPU ||
2192             (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from))
2193                 return rt6_dst_from_check(rt, cookie);
2194         else
2195                 return rt6_check(rt, cookie);
2196 }
2197
/* dst_ops->negative_advice: a socket reports this route as performing
 * badly.  An expired cached exception is unlinked from its exception
 * table; any non-cache dst is released.  Returns the dst the caller
 * should keep using, or NULL to force a fresh lookup.
 */
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			if (rt6_check_expired(rt)) {
				/* unlink only; no dst_release() here */
				rt6_remove_exception_rt(rt);
				dst = NULL;
			}
		} else {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}
2215
/* dst_ops->link_failure: report address-unreachable back to the sender,
 * then invalidate the failing route: cached exceptions are removed (when
 * a reference can still be taken), while default routes poison their fib6
 * node's sernum (-1) so cached dsts fail their cookie check and callers
 * re-lookup.
 */
static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			/* only remove if the dst is not already being freed */
			if (dst_hold_safe(&rt->dst))
				rt6_remove_exception_rt(rt);
		} else if (rt->from) {
			struct fib6_node *fn;

			rcu_read_lock();
			fn = rcu_dereference(rt->from->rt6i_node);
			if (fn && (rt->rt6i_flags & RTF_DEFAULT))
				fn->fn_sernum = -1;
			rcu_read_unlock();
		}
	}
}
2238
/* Record a new path MTU on @rt: set the RTAX_MTU metric, mark the route
 * RTF_MODIFIED and (re)arm its expiry from the ip6_rt_mtu_expires sysctl.
 */
static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
	struct net *net = dev_net(rt->dst.dev);

	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
	rt->rt6i_flags |= RTF_MODIFIED;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}
2247
2248 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2249 {
2250         return !(rt->rt6i_flags & RTF_CACHE) &&
2251                 (rt->rt6i_flags & RTF_PCPU ||
2252                  rcu_access_pointer(rt->rt6i_node));
2253 }
2254
/* Apply a new path MTU to @dst (from an ICMPv6 Packet-Too-Big or a
 * socket hint).  Local routes and routes with a locked MTU metric are
 * left untouched, and the MTU may only shrink (never below IPV6_MIN_MTU).
 * Routes still owned by the fib6 tree (or per-cpu copies) are not
 * modified in place: a cached exception clone carries the MTU instead.
 */
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	if (rt6->rt6i_flags & RTF_LOCAL)
		return;

	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	/* prefer addresses from the triggering packet, else the socket */
	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}
	dst_confirm_neigh(dst, daddr);
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;	/* only ever decrease the PMTU */

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
	} else if (daddr) {
		struct rt6_info *nrt6;

		/* clone into the exception table; free the clone if the
		 * insert fails (e.g. raced with another update)
		 */
		nrt6 = ip6_rt_cache_alloc(rt6->from, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);
			if (rt6_insert_exception(nrt6, rt6->from))
				dst_release_immediate(&nrt6->dst);
		}
	}
}
2298
2299 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2300                                struct sk_buff *skb, u32 mtu)
2301 {
2302         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2303 }
2304
2305 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2306                      int oif, u32 mark, kuid_t uid)
2307 {
2308         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2309         struct dst_entry *dst;
2310         struct flowi6 fl6;
2311
2312         memset(&fl6, 0, sizeof(fl6));
2313         fl6.flowi6_oif = oif;
2314         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2315         fl6.daddr = iph->daddr;
2316         fl6.saddr = iph->saddr;
2317         fl6.flowlabel = ip6_flowinfo(iph);
2318         fl6.flowi6_uid = uid;
2319
2320         dst = ip6_route_output(net, NULL, &fl6);
2321         if (!dst->error)
2322                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2323         dst_release(dst);
2324 }
2325 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2326
/* Update PMTU for @sk's flow, then re-validate the socket's cached dst:
 * if the dst no longer passes its ->check() cookie test, refresh it via
 * ip6_datagram_dst_update() (native IPv6 destinations only).
 */
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	struct dst_entry *dst;

	ip6_update_pmtu(skb, sock_net(sk), mtu,
			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);

	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	/* only touch the socket when it is not owned by user context */
	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2345
/* Store @dst in @sk's dst cache via ip6_dst_store(), passing the socket's
 * destination address (and, with CONFIG_IPV6_SUBTREES, source address)
 * only when it equals the corresponding address the flow used.
 */
void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
			   const struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_SUBTREES
	struct ipv6_pinfo *np = inet6_sk(sk);
#endif

	ip6_dst_store(sk, dst,
		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
		      &sk->sk_v6_daddr : NULL,
#ifdef CONFIG_IPV6_SUBTREES
		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
		      &np->saddr :
#endif
		      NULL);
}
2362
/* Handle redirects */
/* flowi6 extended with the redirecting router's address so that
 * __ip6_route_redirect() (called through the generic flowi6 * callback
 * signature) can validate the redirect source.
 */
struct ip6rd_flowi {
	struct flowi6 fl6;		/* must be first: cast from flowi6 * */
	struct in6_addr gateway;	/* router that sent the redirect */
};
2368
/* Table-lookup callback for redirect processing: find the route the
 * redirect applies to, i.e. a gateway route out of the flow's interface
 * whose next hop is the router that sent the redirect (rdfl->gateway).
 * The exception table is searched too, since a cached clone's gateway may
 * already differ from its parent's.  Always returns a held dst (possibly
 * the null entry).
 */
static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *ret = NULL, *rt_cache;
	struct rt6_info *rt;
	struct fib6_node *fn;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	rcu_read_lock();
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;
		if (fib6_check_expired(rt))
			continue;
		if (rt->rt6i_flags & RTF_REJECT)
			break;
		if (!(rt->rt6i_flags & RTF_GATEWAY))
			continue;
		if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
			continue;
		/* rt_cache's gateway might be different from its 'parent'
		 * in the case of an ip redirect.
		 * So we keep searching in the exception table if the gateway
		 * is different.
		 */
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
			rt_cache = rt6_find_cached_rt(rt,
						      &fl6->daddr,
						      &fl6->saddr);
			if (rt_cache &&
			    ipv6_addr_equal(&rdfl->gateway,
					    &rt_cache->rt6i_gateway)) {
				ret = rt_cache;
				break;
			}
			continue;
		}
		break;
	}

	if (!rt)
		rt = net->ipv6.fib6_null_entry;
	else if (rt->rt6i_flags & RTF_REJECT) {
		ret = net->ipv6.ip6_null_entry;
		goto out;
	}

	/* nothing matched here: retry with a less-specific prefix */
	if (rt == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

out:
	if (ret)
		dst_hold(&ret->dst);
	else
		ret = ip6_create_rt_rcu(rt);

	rcu_read_unlock();

	trace_fib6_table_lookup(net, ret, table, fl6);
	return ret;
};
2448
2449 static struct dst_entry *ip6_route_redirect(struct net *net,
2450                                             const struct flowi6 *fl6,
2451                                             const struct sk_buff *skb,
2452                                             const struct in6_addr *gateway)
2453 {
2454         int flags = RT6_LOOKUP_F_HAS_SADDR;
2455         struct ip6rd_flowi rdfl;
2456
2457         rdfl.fl6 = *fl6;
2458         rdfl.gateway = *gateway;
2459
2460         return fib6_rule_lookup(net, &rdfl.fl6, skb,
2461                                 flags, __ip6_route_redirect);
2462 }
2463
2464 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2465                   kuid_t uid)
2466 {
2467         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2468         struct dst_entry *dst;
2469         struct flowi6 fl6;
2470
2471         memset(&fl6, 0, sizeof(fl6));
2472         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2473         fl6.flowi6_oif = oif;
2474         fl6.flowi6_mark = mark;
2475         fl6.daddr = iph->daddr;
2476         fl6.saddr = iph->saddr;
2477         fl6.flowlabel = ip6_flowinfo(iph);
2478         fl6.flowi6_uid = uid;
2479
2480         dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2481         rt6_do_redirect(dst, NULL, skb);
2482         dst_release(dst);
2483 }
2484 EXPORT_SYMBOL_GPL(ip6_redirect);
2485
2486 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2487                             u32 mark)
2488 {
2489         const struct ipv6hdr *iph = ipv6_hdr(skb);
2490         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2491         struct dst_entry *dst;
2492         struct flowi6 fl6;
2493
2494         memset(&fl6, 0, sizeof(fl6));
2495         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2496         fl6.flowi6_oif = oif;
2497         fl6.flowi6_mark = mark;
2498         fl6.daddr = msg->dest;
2499         fl6.saddr = iph->daddr;
2500         fl6.flowi6_uid = sock_net_uid(net, NULL);
2501
2502         dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2503         rt6_do_redirect(dst, NULL, skb);
2504         dst_release(dst);
2505 }
2506
2507 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2508 {
2509         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2510                      sk->sk_uid);
2511 }
2512 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2513
2514 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2515 {
2516         struct net_device *dev = dst->dev;
2517         unsigned int mtu = dst_mtu(dst);
2518         struct net *net = dev_net(dev);
2519
2520         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2521
2522         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2523                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2524
2525         /*
2526          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2527          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2528          * IPV6_MAXPLEN is also valid and means: "any MSS,
2529          * rely only on pmtu discovery"
2530          */
2531         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2532                 mtu = IPV6_MAXPLEN;
2533         return mtu;
2534 }
2535
2536 static unsigned int ip6_mtu(const struct dst_entry *dst)
2537 {
2538         struct inet6_dev *idev;
2539         unsigned int mtu;
2540
2541         mtu = dst_metric_raw(dst, RTAX_MTU);
2542         if (mtu)
2543                 goto out;
2544
2545         mtu = IPV6_MIN_MTU;
2546
2547         rcu_read_lock();
2548         idev = __in6_dev_get(dst->dev);
2549         if (idev)
2550                 mtu = idev->cnf.mtu6;
2551         rcu_read_unlock();
2552
2553 out:
2554         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2555
2556         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2557 }
2558
/* Allocate a standalone dst for sending an ICMPv6 packet to the
 * destination in @fl6.  The route is never inserted into the FIB; it is
 * placed on the uncached list so rt6_disable_ip() can release the
 * net_device properly when @dev goes away.
 * Returns the (possibly xfrm-wrapped) dst, or an ERR_PTR on failure.
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);	/* +1 idev ref */
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		in6_dev_put(idev);	/* undo the ref taken above */
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output  = ip6_output;
	rt->rt6i_gateway  = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	/* ownership of the idev reference moves to rt here */
	rt->rt6i_idev     = idev;
	/* NOTE(review): presumably 0 means "use the default hop limit";
	 * confirm against the RTAX_HOPLIMIT consumers.
	 */
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Add this dst into uncached_list so that rt6_disable_ip() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);
	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}
2597
/* dst_ops.gc callback: garbage-collect cached IPv6 routes for the
 * namespace owning @ops.  The nonzero return (cache still over
 * ip6_rt_max_size) tells the dst layer that allocation should fail.
 */
static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	/* rate-limit: skip the scan when one ran recently and we are not
	 * over the hard limit
	 */
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	/* each pass raises the expiry pressure handed to fib6_run_gc();
	 * a sufficiently effective pass resets it to half the timeout
	 */
	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
	/* exponential decay of the accumulated GC pressure */
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
	return entries > rt_max_size;
}
2622
2623 static int ip6_convert_metrics(struct net *net, struct rt6_info *rt,
2624                                struct fib6_config *cfg)
2625 {
2626         int err = 0;
2627
2628         if (cfg->fc_mx) {
2629                 rt->fib6_metrics = kzalloc(sizeof(*rt->fib6_metrics),
2630                                            GFP_KERNEL);
2631                 if (unlikely(!rt->fib6_metrics))
2632                         return -ENOMEM;
2633
2634                 refcount_set(&rt->fib6_metrics->refcnt, 1);
2635
2636                 err = ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len,
2637                                          rt->fib6_metrics->metrics);
2638         }
2639
2640         return err;
2641 }
2642
2643 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2644                                             struct fib6_config *cfg,
2645                                             const struct in6_addr *gw_addr,
2646                                             u32 tbid, int flags)
2647 {
2648         struct flowi6 fl6 = {
2649                 .flowi6_oif = cfg->fc_ifindex,
2650                 .daddr = *gw_addr,
2651                 .saddr = cfg->fc_prefsrc,
2652         };
2653         struct fib6_table *table;
2654         struct rt6_info *rt;
2655
2656         table = fib6_get_table(net, tbid);
2657         if (!table)
2658                 return NULL;
2659
2660         if (!ipv6_addr_any(&cfg->fc_prefsrc))
2661                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2662
2663         flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2664         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2665
2666         /* if table lookup failed, fall back to full lookup */
2667         if (rt == net->ipv6.ip6_null_entry) {
2668                 ip6_rt_put(rt);
2669                 rt = NULL;
2670         }
2671
2672         return rt;
2673 }
2674
2675 static int ip6_route_check_nh_onlink(struct net *net,
2676                                      struct fib6_config *cfg,
2677                                      const struct net_device *dev,
2678                                      struct netlink_ext_ack *extack)
2679 {
2680         u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2681         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2682         u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2683         struct rt6_info *grt;
2684         int err;
2685
2686         err = 0;
2687         grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2688         if (grt) {
2689                 if (!grt->dst.error &&
2690                     (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2691                         NL_SET_ERR_MSG(extack,
2692                                        "Nexthop has invalid gateway or device mismatch");
2693                         err = -EINVAL;
2694                 }
2695
2696                 ip6_rt_put(grt);
2697         }
2698
2699         return err;
2700 }
2701
/* Resolve the route towards cfg->fc_gateway to validate the nexthop.
 * When the caller supplied no egress device (*_dev == NULL), adopt the
 * device and inet6_dev of the gateway route, taking a reference on each
 * for the caller.  Returns 0 when the gateway is directly reachable
 * (the resolved route has no RTF_GATEWAY hop), -EHOSTUNREACH otherwise.
 */
static int ip6_route_check_nh(struct net *net,
			      struct fib6_config *cfg,
			      struct net_device **_dev,
			      struct inet6_dev **idev)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	struct net_device *dev = _dev ? *_dev : NULL;
	struct rt6_info *grt = NULL;
	int err = -EHOSTUNREACH;

	if (cfg->fc_table) {
		int flags = RT6_LOOKUP_F_IFACE;

		/* first try the table the new route is destined for */
		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
					  cfg->fc_table, flags);
		if (grt) {
			/* a recursive gateway or a device mismatch does
			 * not qualify; drop it and fall back below
			 */
			if (grt->rt6i_flags & RTF_GATEWAY ||
			    (dev && dev != grt->dst.dev)) {
				ip6_rt_put(grt);
				grt = NULL;
			}
		}
	}

	/* fall back to a full (rule-based) lookup */
	if (!grt)
		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);

	if (!grt)
		goto out;

	if (dev) {
		if (dev != grt->dst.dev) {
			ip6_rt_put(grt);
			goto out;	/* err stays -EHOSTUNREACH */
		}
	} else {
		/* adopt device/idev from the gateway route; references
		 * are handed to the caller via *_dev / *idev
		 */
		*_dev = dev = grt->dst.dev;
		*idev = grt->rt6i_idev;
		dev_hold(dev);
		in6_dev_hold(grt->rt6i_idev);
	}

	if (!(grt->rt6i_flags & RTF_GATEWAY))
		err = 0;

	ip6_rt_put(grt);

out:
	return err;
}
2752
/* Validate the gateway of a new route (cfg->fc_gateway) and, when the
 * caller supplied no egress device, resolve one via the gateway route.
 * *_dev and *idev may be updated (with references held) by
 * ip6_route_check_nh().  Returns 0 on success or a negative errno with
 * an extack message set.
 */
static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
			   struct net_device **_dev, struct inet6_dev **idev,
			   struct netlink_ext_ack *extack)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	int gwa_type = ipv6_addr_type(gw_addr);
	/* only a link-local gateway may match a local address on some
	 * other device, so restrict the "is local" check to @dev then
	 */
	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
	const struct net_device *dev = *_dev;
	bool need_addr_check = !dev;
	int err = -EINVAL;

	/* if gw_addr is local we will fail to detect this in case
	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
	 * will return already-added prefix route via interface that
	 * prefix route was assigned to, which might be non-loopback.
	 */
	if (dev &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
		/* IPv6 strictly inhibits using not link-local
		 * addresses as nexthop address.
		 * Otherwise, router will not able to send redirects.
		 * It is very good, but in some (rare!) circumstances
		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
		 * some exceptions. --ANK
		 * We allow IPv4-mapped nexthops to support RFC4798-type
		 * addressing
		 */
		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
			NL_SET_ERR_MSG(extack, "Invalid gateway address");
			goto out;
		}

		/* onlink routes skip the reachability check */
		if (cfg->fc_flags & RTNH_F_ONLINK)
			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
		else
			err = ip6_route_check_nh(net, cfg, _dev, idev);

		if (err)
			goto out;
	}

	/* reload in case device was changed */
	dev = *_dev;

	err = -EINVAL;
	if (!dev) {
		NL_SET_ERR_MSG(extack, "Egress device not specified");
		goto out;
	} else if (dev->flags & IFF_LOOPBACK) {
		NL_SET_ERR_MSG(extack,
			       "Egress device can not be loopback device for this route");
		goto out;
	}

	/* if we did not check gw_addr above, do so now that the
	 * egress device has been resolved.
	 */
	if (need_addr_check &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	err = 0;
out:
	return err;
}
2825
/* Build (but do not insert) a new FIB6 entry from a fib6_config.
 * Returns the new rt6_info with one reference held, or an ERR_PTR with
 * an extack message set.  On success the returned entry owns the
 * references taken here on the egress device and its inet6_dev.
 */
static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
					      gfp_t gfp_flags,
					      struct netlink_ext_ack *extack)
{
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct rt6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	struct fib6_table *table;
	int addr_type;
	int err = -EINVAL;

	/* RTF_PCPU is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_PCPU) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
		goto out;
	}

	/* RTF_CACHE is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_CACHE) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
		goto out;
	}

	if (cfg->fc_type > RTN_MAX) {
		NL_SET_ERR_MSG(extack, "Invalid route type");
		goto out;
	}

	if (cfg->fc_dst_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		goto out;
	}
	if (cfg->fc_src_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid source address length");
		goto out;
	}
#ifndef CONFIG_IPV6_SUBTREES
	if (cfg->fc_src_len) {
		NL_SET_ERR_MSG(extack,
			       "Specifying source address requires IPV6_SUBTREES to be enabled");
		goto out;
	}
#endif
	if (cfg->fc_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(net, cfg->fc_ifindex);	/* +1 dev ref */
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);	/* +1 idev ref */
		if (!idev)
			goto out;
	}

	if (cfg->fc_metric == 0)
		cfg->fc_metric = IP6_RT_PRIO_USER;

	/* an onlink nexthop requires an explicit, UP device */
	if (cfg->fc_flags & RTNH_F_ONLINK) {
		if (!dev) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop device required for onlink");
			err = -ENODEV;
			goto out;
		}

		if (!(dev->flags & IFF_UP)) {
			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
			err = -ENETDOWN;
			goto out;
		}
	}

	err = -ENOBUFS;
	/* without NLM_F_CREATE prefer an existing table, but (for
	 * backwards compatibility) still create one with a warning
	 */
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
		if (!table) {
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}

	if (!table)
		goto out;

	err = -ENOMEM;
	rt = fib6_info_alloc(gfp_flags);
	if (!rt)
		goto out;

	if (cfg->fc_flags & RTF_ADDRCONF)
		rt->dst_nocount = true;

	err = ip6_convert_metrics(net, rt, cfg);
	if (err < 0)
		goto out;

	if (cfg->fc_flags & RTF_EXPIRES)
		fib6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
	else
		fib6_clean_expires(rt);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->rt6i_protocol = cfg->fc_protocol;

	addr_type = ipv6_addr_type(&cfg->fc_dst);

	/* lightweight-tunnel encapsulation, if requested */
	if (cfg->fc_encap) {
		struct lwtunnel_state *lwtstate;

		err = lwtunnel_build_state(cfg->fc_encap_type,
					   cfg->fc_encap, AF_INET6, cfg,
					   &lwtstate, extack);
		if (err)
			goto out;
		rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
	}

	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->rt6i_dst.plen = cfg->fc_dst_len;
	if (rt->rt6i_dst.plen == 128)
		rt->dst_host = true;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->rt6i_src.plen = cfg->fc_src_len;
#endif

	rt->rt6i_metric = cfg->fc_metric;
	rt->fib6_nh.nh_weight = 1;

	rt->fib6_type = cfg->fc_type;

	/* We cannot add true routes via loopback here,
	   they would result in kernel looping; promote them to reject routes
	 */
	if ((cfg->fc_flags & RTF_REJECT) ||
	    (dev && (dev->flags & IFF_LOOPBACK) &&
	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
	     !(cfg->fc_flags & RTF_LOCAL))) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
		goto install_route;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		/* may replace dev/idev (references included) */
		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
		if (err)
			goto out;

		rt->fib6_nh.nh_gw = cfg->fc_gateway;
	}

	err = -ENODEV;
	if (!dev)
		goto out;

	if (idev->cnf.disable_ipv6) {
		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
		err = -EACCES;
		goto out;
	}

	if (!(dev->flags & IFF_UP)) {
		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
		err = -ENETDOWN;
		goto out;
	}

	/* a preferred source address must be assigned to the device */
	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid source address");
			err = -EINVAL;
			goto out;
		}
		rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
		rt->rt6i_prefsrc.plen = 128;
	} else
		rt->rt6i_prefsrc.plen = 0;

	rt->rt6i_flags = cfg->fc_flags;

install_route:
	/* non-local/anycast routes on a carrier-less device are kept but
	 * marked linkdown
	 */
	if (!(rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
	    !netif_carrier_ok(dev))
		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
	rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
	/* rt takes over the dev/idev references accumulated above */
	rt->fib6_nh.nh_dev = dev;
	rt->rt6i_idev = idev;
	rt->rt6i_table = table;

	cfg->fc_nlinfo.nl_net = dev_net(dev);

	return rt;
out:
	/* error path: drop every reference taken so far */
	if (dev)
		dev_put(dev);
	if (idev)
		in6_dev_put(idev);

	fib6_info_release(rt);
	return ERR_PTR(err);
}
3046
3047 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3048                   struct netlink_ext_ack *extack)
3049 {
3050         struct rt6_info *rt;
3051         int err;
3052
3053         rt = ip6_route_info_create(cfg, gfp_flags, extack);
3054         if (IS_ERR(rt))
3055                 return PTR_ERR(rt);
3056
3057         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3058         fib6_info_release(rt);
3059
3060         return err;
3061 }
3062
/* Remove @rt from its FIB table under the table lock.  Consumes the
 * caller's reference on @rt even on error.  Returns fib6_del()'s
 * result, or -ENOENT for the null entry.
 */
static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
{
	struct net *net = info->nl_net;
	struct fib6_table *table;
	int err;

	/* the null entry is never in a table and must not be deleted */
	if (rt == net->ipv6.fib6_null_entry) {
		err = -ENOENT;
		goto out;
	}

	table = rt->rt6i_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_del(rt, info);
	spin_unlock_bh(&table->tb6_lock);

out:
	fib6_info_release(rt);
	return err;
}
3083
3084 int ip6_del_rt(struct net *net, struct rt6_info *rt)
3085 {
3086         struct nl_info info = { .nl_net = net };
3087
3088         return __ip6_del_rt(rt, &info);
3089 }
3090
/* Delete @rt and — when cfg->fc_delete_all_nh is set — all of its ECMP
 * siblings as one operation.  When possible a single RTM_DELROUTE
 * notification covering every hop is sent (outside the table lock);
 * otherwise each fib6_del() notifies individually.  Consumes the
 * caller's reference on @rt.
 */
static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	int err = -ENOENT;

	if (rt == net->ipv6.fib6_null_entry)
		goto out_put;
	table = rt->rt6i_table;
	spin_lock_bh(&table->tb6_lock);

	if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
		struct rt6_info *sibling, *next_sibling;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			if (rt6_fill_node(net, skb, rt, NULL,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				kfree_skb(skb);
				skb = NULL;
			} else
				/* combined message built: suppress the
				 * per-hop notifications from fib6_del()
				 */
				info->skip_notify = 1;
		}

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->rt6i_siblings,
					 rt6i_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
		}
	}

	err = fib6_del(rt, info);
out_unlock:
	spin_unlock_bh(&table->tb6_lock);
out_put:
	fib6_info_release(rt);

	/* deliver the combined notification outside the table lock */
	if (skb) {
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
	return err;
}
3142
3143 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3144 {
3145         int rc = -ESRCH;
3146
3147         if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3148                 goto out;
3149
3150         if (cfg->fc_flags & RTF_GATEWAY &&
3151             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3152                 goto out;
3153         if (dst_hold_safe(&rt->dst))
3154                 rc = rt6_remove_exception_rt(rt);
3155 out:
3156         return rc;
3157 }
3158
3159 static int ip6_route_del(struct fib6_config *cfg,
3160                          struct netlink_ext_ack *extack)
3161 {
3162         struct rt6_info *rt, *rt_cache;
3163         struct fib6_table *table;
3164         struct fib6_node *fn;
3165         int err = -ESRCH;
3166
3167         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3168         if (!table) {
3169                 NL_SET_ERR_MSG(extack, "FIB table does not exist");
3170                 return err;
3171         }
3172
3173         rcu_read_lock();
3174
3175         fn = fib6_locate(&table->tb6_root,
3176                          &cfg->fc_dst, cfg->fc_dst_len,
3177                          &cfg->fc_src, cfg->fc_src_len,
3178                          !(cfg->fc_flags & RTF_CACHE));
3179
3180         if (fn) {
3181                 for_each_fib6_node_rt_rcu(fn) {
3182                         if (cfg->fc_flags & RTF_CACHE) {
3183                                 int rc;
3184
3185                                 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3186                                                               &cfg->fc_src);
3187                                 if (rt_cache) {
3188                                         rc = ip6_del_cached_rt(rt_cache, cfg);
3189                                         if (rc != -ESRCH)
3190                                                 return rc;
3191                                 }
3192                                 continue;
3193                         }
3194                         if (cfg->fc_ifindex &&
3195                             (!rt->fib6_nh.nh_dev ||
3196                              rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
3197                                 continue;
3198                         if (cfg->fc_flags & RTF_GATEWAY &&
3199                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
3200                                 continue;
3201                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
3202                                 continue;
3203                         if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
3204                                 continue;
3205                         fib6_info_hold(rt);
3206                         rcu_read_unlock();
3207
3208                         /* if gateway was specified only delete the one hop */
3209                         if (cfg->fc_flags & RTF_GATEWAY)
3210                                 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3211
3212                         return __ip6_del_rt_siblings(rt, cfg);
3213                 }
3214         }
3215         rcu_read_unlock();
3216
3217         return err;
3218 }
3219
/* Handle a validated ICMPv6 Redirect (RFC 4861 sec. 8) delivered for
 * @dst: sanity-check the message and its ND options, update the
 * neighbour cache for the new first hop, and install a cached RTF_CACHE
 * route (exception) pointing at it.
 */
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct netevent_redirect netevent;
	struct rt6_info *rt, *nrt = NULL;
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	struct rd_msg *msg;
	int optlen, on_link;
	u8 *lladdr;

	/* everything after the rd_msg header is ND options */
	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
	optlen -= sizeof(*msg);

	if (optlen < 0) {
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
		return;
	}

	msg = (struct rd_msg *)icmp6_hdr(skb);

	if (ipv6_addr_is_multicast(&msg->dest)) {
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
		return;
	}

	/* dest == target means the destination itself is on-link */
	on_link = 0;
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
		on_link = 1;
	} else if (ipv6_addr_type(&msg->target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
		return;
	}

	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	/* routers, and hosts configured to ignore redirects, drop them */
	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}

	lladdr = NULL;
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

	rt = (struct rt6_info *) dst;
	if (rt->rt6i_flags & RTF_REJECT) {
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
		return;
	}

	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);

	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
	if (!neigh)
		return;

	/*
	 *	We have finally decided to accept it.
	 */

	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER)),
		     NDISC_REDIRECT, &ndopts);

	/* cache-route clone keyed on the redirected destination */
	nrt = ip6_rt_cache_alloc(rt->from, &msg->dest, NULL);
	if (!nrt)
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	nrt->rt6i_protocol = RTPROT_REDIRECT;
	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

	/* No need to remove rt from the exception table if rt is
	 * a cached route because rt6_insert_exception() will
	 * takes care of it
	 */
	if (rt6_insert_exception(nrt, rt->from)) {
		dst_release_immediate(&nrt->dst);
		goto out;
	}

	/* tell interested parties (e.g. offload drivers) about the change */
	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
	netevent.daddr = &msg->dest;
	netevent.neigh = neigh;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

out:
	neigh_release(neigh);
}
3337
3338 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Find an existing RA Route Information route (RTF_ROUTEINFO|RTF_GATEWAY)
 * for @prefix/@prefixlen learned from gateway @gwaddr via @dev.
 * Returns the entry with a reference held, or NULL.
 */
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	int ifindex = dev->ifindex;
	struct fib6_node *fn;
	struct rt6_info *rt = NULL;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
	if (!fn)
		goto out;

	/* the iterator binds @rt to each entry at this node in turn */
	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.nh_dev->ifindex != ifindex)
			continue;
		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
			continue;
		if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
			continue;
		/* take a reference before leaving the RCU section */
		ip6_hold_safe(NULL, &rt, false);
		break;
	}
out:
	rcu_read_unlock();
	return rt;
}
3373
3374 static struct rt6_info *rt6_add_route_info(struct net *net,
3375                                            const struct in6_addr *prefix, int prefixlen,
3376                                            const struct in6_addr *gwaddr,
3377                                            struct net_device *dev,
3378                                            unsigned int pref)
3379 {
3380         struct fib6_config cfg = {
3381                 .fc_metric      = IP6_RT_PRIO_USER,
3382                 .fc_ifindex     = dev->ifindex,
3383                 .fc_dst_len     = prefixlen,
3384                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3385                                   RTF_UP | RTF_PREF(pref),
3386                 .fc_protocol = RTPROT_RA,
3387                 .fc_type = RTN_UNICAST,
3388                 .fc_nlinfo.portid = 0,
3389                 .fc_nlinfo.nlh = NULL,
3390                 .fc_nlinfo.nl_net = net,
3391         };
3392
3393         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3394         cfg.fc_dst = *prefix;
3395         cfg.fc_gateway = *gwaddr;
3396
3397         /* We should treat it as a default route if prefix length is 0. */
3398         if (!prefixlen)
3399                 cfg.fc_flags |= RTF_DEFAULT;
3400
3401         ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3402
3403         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3404 }
3405 #endif
3406
/* Find the RA-installed default route via gateway @addr on @dev.
 * Returns the route with a reference held, or NULL.
 */
struct rt6_info *rt6_get_dflt_router(struct net *net,
				     const struct in6_addr *addr,
				     struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
	struct rt6_info *rt;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	/* rt is bound by the iterator macro; it is left NULL/unset only
	 * when the list is exhausted, hence the check below
	 */
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		if (dev == rt->fib6_nh.nh_dev &&
		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
			break;
	}
	if (rt)
		ip6_hold_safe(NULL, &rt, false);
	rcu_read_unlock();
	return rt;
}
3431
/* Install an RA-learned default route via gateway @gwaddr on @dev with
 * router preference @pref, flag the table as holding a default router,
 * and return the installed route (referenced) or NULL on failure.
 */
struct rt6_info *rt6_add_dflt_router(struct net *net,
				     const struct in6_addr *gwaddr,
				     struct net_device *dev,
				     unsigned int pref)
{
	struct fib6_config cfg = {
		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
		.fc_metric	= IP6_RT_PRIO_USER,
		.fc_ifindex	= dev->ifindex,
		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
		.fc_protocol = RTPROT_RA,
		.fc_type = RTN_UNICAST,
		.fc_nlinfo.portid = 0,
		.fc_nlinfo.nlh = NULL,
		.fc_nlinfo.nl_net = net,
	};

	cfg.fc_gateway = *gwaddr;

	if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
		struct fib6_table *table;

		/* NOTE(review): uses dev_net(dev) although @net was passed in;
		 * presumably these are always the same here -- confirm callers
		 */
		table = fib6_get_table(dev_net(dev), cfg.fc_table);
		if (table)
			/* lets rt6_purge_dflt_routers() skip clean tables */
			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
	}

	return rt6_get_dflt_router(net, gwaddr, dev);
}
3462
/* Delete every route in @table carrying RTF_DEFAULT or RTF_ADDRCONF,
 * except on interfaces configured with accept_ra == 2 (always accept RA),
 * then clear the table's default-router flag.
 */
static void __rt6_purge_dflt_routers(struct net *net,
				     struct fib6_table *table)
{
	struct rt6_info *rt;

restart:
	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
			/* hold the route so it survives dropping RCU, delete
			 * it, then rescan from the top since the tree may
			 * have changed under us
			 */
			fib6_info_hold(rt);
			rcu_read_unlock();
			ip6_del_rt(net, rt);
			goto restart;
		}
	}
	rcu_read_unlock();

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}
3483
3484 void rt6_purge_dflt_routers(struct net *net)
3485 {
3486         struct fib6_table *table;
3487         struct hlist_head *head;
3488         unsigned int h;
3489
3490         rcu_read_lock();
3491
3492         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3493                 head = &net->ipv6.fib_table_hash[h];
3494                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3495                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3496                                 __rt6_purge_dflt_routers(net, table);
3497                 }
3498         }
3499
3500         rcu_read_unlock();
3501 }
3502
3503 static void rtmsg_to_fib6_config(struct net *net,
3504                                  struct in6_rtmsg *rtmsg,
3505                                  struct fib6_config *cfg)
3506 {
3507         memset(cfg, 0, sizeof(*cfg));
3508
3509         cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3510                          : RT6_TABLE_MAIN;
3511         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3512         cfg->fc_metric = rtmsg->rtmsg_metric;
3513         cfg->fc_expires = rtmsg->rtmsg_info;
3514         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3515         cfg->fc_src_len = rtmsg->rtmsg_src_len;
3516         cfg->fc_flags = rtmsg->rtmsg_flags;
3517         cfg->fc_type = rtmsg->rtmsg_type;
3518
3519         cfg->fc_nlinfo.nl_net = net;
3520
3521         cfg->fc_dst = rtmsg->rtmsg_dst;
3522         cfg->fc_src = rtmsg->rtmsg_src;
3523         cfg->fc_gateway = rtmsg->rtmsg_gateway;
3524 }
3525
3526 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3527 {
3528         struct fib6_config cfg;
3529         struct in6_rtmsg rtmsg;
3530         int err;
3531
3532         switch (cmd) {
3533         case SIOCADDRT:         /* Add a route */
3534         case SIOCDELRT:         /* Delete a route */
3535                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3536                         return -EPERM;
3537                 err = copy_from_user(&rtmsg, arg,
3538                                      sizeof(struct in6_rtmsg));
3539                 if (err)
3540                         return -EFAULT;
3541
3542                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3543
3544                 rtnl_lock();
3545                 switch (cmd) {
3546                 case SIOCADDRT:
3547                         err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3548                         break;
3549                 case SIOCDELRT:
3550                         err = ip6_route_del(&cfg, NULL);
3551                         break;
3552                 default:
3553                         err = -EINVAL;
3554                 }
3555                 rtnl_unlock();
3556
3557                 return err;
3558         }
3559
3560         return -EINVAL;
3561 }
3562
3563 /*
3564  *      Drop the packet on the floor
3565  */
3566
/* Account the dropped packet in the appropriate SNMP counter, send an
 * ICMPv6 Destination Unreachable with the given @code, and free the skb.
 * Always returns 0.
 */
static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
	int type;
	struct dst_entry *dst = skb_dst(skb);
	switch (ipstats_mib_noroutes) {
	case IPSTATS_MIB_INNOROUTES:
		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
		if (type == IPV6_ADDR_ANY) {
			/* unspecified destination: count as an address error
			 * rather than a no-route
			 */
			IP6_INC_STATS(dev_net(dst->dev),
				      __in6_dev_get_safely(skb->dev),
				      IPSTATS_MIB_INADDRERRORS);
			break;
		}
		/* FALLTHROUGH */
	case IPSTATS_MIB_OUTNOROUTES:
		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
			      ipstats_mib_noroutes);
		break;
	}
	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
	kfree_skb(skb);
	return 0;
}
3590
/* dst input handler for blackhole-style routes: drop with "no route" */
static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
3595
/* dst output handler: drop with "no route", counted on the egress side */
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}
3601
/* dst input handler for prohibit routes: drop with "administratively
 * prohibited"
 */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}
3606
/* dst output handler for prohibit routes, counted on the egress side */
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}
3612
3613 /*
3614  *      Allocate a dst for local (unicast / anycast) address.
3615  */
3616
/* Allocate and initialise a host route (plen 128) for a local unicast or
 * anycast address.  Takes references on @idev and on idev->dev; returns
 * the new route or ERR_PTR(-ENOMEM).  The route is not yet inserted into
 * any table; rt6i_table is only resolved here.
 */
struct rt6_info *addrconf_dst_alloc(struct net *net,
				    struct inet6_dev *idev,
				    const struct in6_addr *addr,
				    bool anycast, gfp_t gfp_flags)
{
	u32 tb_id;
	struct net_device *dev = idev->dev;
	struct rt6_info *rt;

	rt = fib6_info_alloc(gfp_flags);
	if (!rt)
		return ERR_PTR(-ENOMEM);

	/* NOTE(review): presumably excludes this entry from dst accounting
	 * -- confirm against fib6_info_alloc/dst handling
	 */
	rt->dst_nocount = true;

	in6_dev_hold(idev);
	rt->rt6i_idev = idev;

	rt->dst_host = true;
	rt->rt6i_protocol = RTPROT_KERNEL;
	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
	if (anycast) {
		rt->fib6_type = RTN_ANYCAST;
		rt->rt6i_flags |= RTF_ANYCAST;
	} else {
		rt->fib6_type = RTN_LOCAL;
		rt->rt6i_flags |= RTF_LOCAL;
	}

	/* nexthop "gateway" is the address itself for local routes */
	rt->fib6_nh.nh_gw = *addr;
	dev_hold(dev);
	rt->fib6_nh.nh_dev = dev;
	rt->rt6i_dst.addr = *addr;
	rt->rt6i_dst.plen = 128;
	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
	rt->rt6i_table = fib6_get_table(net, tb_id);

	return rt;
}
3656
3657 /* remove deleted ip from prefsrc entries */
/* Argument bundle for fib6_remove_prefsrc() via fib6_clean_all() */
struct arg_dev_net_ip {
	struct net_device *dev;	/* restrict to this device; NULL = any */
	struct net *net;	/* namespace being cleaned */
	struct in6_addr *addr;	/* preferred-source address being removed */
};
3663
3664 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
3665 {
3666         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3667         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3668         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3669
3670         if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3671             rt != net->ipv6.fib6_null_entry &&
3672             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
3673                 spin_lock_bh(&rt6_exception_lock);
3674                 /* remove prefsrc entry */
3675                 rt->rt6i_prefsrc.plen = 0;
3676                 /* need to update cache as well */
3677                 rt6_exceptions_remove_prefsrc(rt);
3678                 spin_unlock_bh(&rt6_exception_lock);
3679         }
3680         return 0;
3681 }
3682
/* Called when address @ifp is deleted: walk all FIB entries in its
 * namespace and drop any prefsrc references to it.
 */
void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
{
	struct net *net = dev_net(ifp->idev->dev);
	struct arg_dev_net_ip adni = {
		.dev = ifp->idev->dev,
		.net = net,
		.addr = &ifp->addr,
	};
	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
}
3693
/* Flag combination identifying a default-router route learned via RA */
#define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3695
3696 /* Remove routers and update dst entries when gateway turn into host. */
3697 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
3698 {
3699         struct in6_addr *gateway = (struct in6_addr *)arg;
3700
3701         if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3702             ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
3703                 return -1;
3704         }
3705
3706         /* Further clean up cached routes in exception table.
3707          * This is needed because cached route may have a different
3708          * gateway than its 'parent' in the case of an ip redirect.
3709          */
3710         rt6_exceptions_clean_tohost(rt, gateway);
3711
3712         return 0;
3713 }
3714
/* Walk the whole FIB and purge router routes through @gateway; see
 * fib6_clean_tohost() for the per-route policy.
 */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}
3719
/* Argument for the fib6_ifup/fib6_ifdown cleaners: carries either the
 * nexthop flags to clear (rt6_sync_up) or the netdev event being
 * processed (rt6_sync_down_dev) -- never both.
 */
struct arg_netdev_event {
	const struct net_device *dev;
	union {
		unsigned int nh_flags;
		unsigned long event;
	};
};
3727
/* Return the first ECMP-eligible route with the same metric as @rt on
 * rt's fib6 node, or NULL.  Must be called with the table lock held
 * (enforced via lockdep on the rcu_dereference_protected calls).
 */
static struct rt6_info *rt6_multipath_first_sibling(const struct rt6_info *rt)
{
	struct rt6_info *iter;
	struct fib6_node *fn;

	fn = rcu_dereference_protected(rt->rt6i_node,
			lockdep_is_held(&rt->rt6i_table->tb6_lock));
	iter = rcu_dereference_protected(fn->leaf,
			lockdep_is_held(&rt->rt6i_table->tb6_lock));
	while (iter) {
		if (iter->rt6i_metric == rt->rt6i_metric &&
		    rt6_qualify_for_ecmp(iter))
			return iter;
		iter = rcu_dereference_protected(iter->rt6_next,
				lockdep_is_held(&rt->rt6i_table->tb6_lock));
	}

	return NULL;
}
3747
3748 static bool rt6_is_dead(const struct rt6_info *rt)
3749 {
3750         if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3751             (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3752              rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
3753                 return true;
3754
3755         return false;
3756 }
3757
3758 static int rt6_multipath_total_weight(const struct rt6_info *rt)
3759 {
3760         struct rt6_info *iter;
3761         int total = 0;
3762
3763         if (!rt6_is_dead(rt))
3764                 total += rt->fib6_nh.nh_weight;
3765
3766         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) {
3767                 if (!rt6_is_dead(iter))
3768                         total += iter->fib6_nh.nh_weight;
3769         }
3770
3771         return total;
3772 }
3773
/* Assign this nexthop's hash-threshold upper bound: the running weight
 * (*weight, accumulated across siblings) scaled into the [0, 2^31)
 * range against @total.  Dead nexthops get -1 so the selector never
 * picks them.
 */
static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total)
{
	int upper_bound = -1;

	if (!rt6_is_dead(rt)) {
		*weight += rt->fib6_nh.nh_weight;
		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
						    total) - 1;
	}
	atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
}
3785
3786 static void rt6_multipath_upper_bound_set(struct rt6_info *rt, int total)
3787 {
3788         struct rt6_info *iter;
3789         int weight = 0;
3790
3791         rt6_upper_bound_set(rt, &weight, total);
3792
3793         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3794                 rt6_upper_bound_set(iter, &weight, total);
3795 }
3796
/* Recompute the weight-based upper bounds of a multipath route after a
 * nexthop changed state.  No-op for non-multipath routes and for routes
 * already marked for flushing.
 */
void rt6_multipath_rebalance(struct rt6_info *rt)
{
	struct rt6_info *first;
	int total;

	/* In case the entire multipath route was marked for flushing,
	 * then there is no need to rebalance upon the removal of every
	 * sibling route.
	 */
	if (!rt->rt6i_nsiblings || rt->should_flush)
		return;

	/* During lookup routes are evaluated in order, so we need to
	 * make sure upper bounds are assigned from the first sibling
	 * onwards.
	 */
	first = rt6_multipath_first_sibling(rt);
	if (WARN_ON_ONCE(!first))
		return;

	total = rt6_multipath_total_weight(first);
	rt6_multipath_upper_bound_set(first, total);
}
3820
3821 static int fib6_ifup(struct rt6_info *rt, void *p_arg)
3822 {
3823         const struct arg_netdev_event *arg = p_arg;
3824         struct net *net = dev_net(arg->dev);
3825
3826         if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
3827                 rt->fib6_nh.nh_flags &= ~arg->nh_flags;
3828                 fib6_update_sernum_upto_root(net, rt);
3829                 rt6_multipath_rebalance(rt);
3830         }
3831
3832         return 0;
3833 }
3834
/* Clear @nh_flags on all routes using @dev (device came up / carrier
 * restored).
 */
void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.nh_flags = nh_flags,
		},
	};

	/* when reviving a DEAD nexthop and the carrier is up, clear the
	 * LINKDOWN flag as well
	 */
	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
		arg.nh_flags |= RTNH_F_LINKDOWN;

	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
}
3849
3850 static bool rt6_multipath_uses_dev(const struct rt6_info *rt,
3851                                    const struct net_device *dev)
3852 {
3853         struct rt6_info *iter;
3854
3855         if (rt->fib6_nh.nh_dev == dev)
3856                 return true;
3857         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3858                 if (iter->fib6_nh.nh_dev == dev)
3859                         return true;
3860
3861         return false;
3862 }
3863
3864 static void rt6_multipath_flush(struct rt6_info *rt)
3865 {
3866         struct rt6_info *iter;
3867
3868         rt->should_flush = 1;
3869         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3870                 iter->should_flush = 1;
3871 }
3872
3873 static unsigned int rt6_multipath_dead_count(const struct rt6_info *rt,
3874                                              const struct net_device *down_dev)
3875 {
3876         struct rt6_info *iter;
3877         unsigned int dead = 0;
3878
3879         if (rt->fib6_nh.nh_dev == down_dev ||
3880             rt->fib6_nh.nh_flags & RTNH_F_DEAD)
3881                 dead++;
3882         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3883                 if (iter->fib6_nh.nh_dev == down_dev ||
3884                     iter->fib6_nh.nh_flags & RTNH_F_DEAD)
3885                         dead++;
3886
3887         return dead;
3888 }
3889
3890 static void rt6_multipath_nh_flags_set(struct rt6_info *rt,
3891                                        const struct net_device *dev,
3892                                        unsigned int nh_flags)
3893 {
3894         struct rt6_info *iter;
3895
3896         if (rt->fib6_nh.nh_dev == dev)
3897                 rt->fib6_nh.nh_flags |= nh_flags;
3898         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3899                 if (iter->fib6_nh.nh_dev == dev)
3900                         iter->fib6_nh.nh_flags |= nh_flags;
3901 }
3902
/* called with write lock held for table with rt */
static int fib6_ifdown(struct rt6_info *rt, void *p_arg)
{
	const struct arg_netdev_event *arg = p_arg;
	const struct net_device *dev = arg->dev;
	struct net *net = dev_net(dev);

	/* never touch the permanent null entry */
	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	switch (arg->event) {
	case NETDEV_UNREGISTER:
		/* device going away: delete (-1) any route using it */
		return rt->fib6_nh.nh_dev == dev ? -1 : 0;
	case NETDEV_DOWN:
		if (rt->should_flush)
			return -1;
		/* non-multipath route: delete iff it egresses via dev */
		if (!rt->rt6i_nsiblings)
			return rt->fib6_nh.nh_dev == dev ? -1 : 0;
		if (rt6_multipath_uses_dev(rt, dev)) {
			unsigned int count;

			count = rt6_multipath_dead_count(rt, dev);
			/* every nexthop dead: flush the whole group */
			if (rt->rt6i_nsiblings + 1 == count) {
				rt6_multipath_flush(rt);
				return -1;
			}
			/* otherwise just mark the affected nexthops dead
			 * and rebalance the survivors
			 */
			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
						   RTNH_F_LINKDOWN);
			fib6_update_sernum(net, rt);
			rt6_multipath_rebalance(rt);
		}
		/* NOTE(review): -2 is a distinct keep-route result for
		 * fib6_clean_all -- confirm its exact semantics there
		 */
		return -2;
	case NETDEV_CHANGE:
		if (rt->fib6_nh.nh_dev != dev ||
		    rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST))
			break;
		/* carrier loss: flag the nexthop link-down and rebalance */
		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
		rt6_multipath_rebalance(rt);
		break;
	}

	return 0;
}
3946
/* Propagate a netdev down/unregister/change @event to all routes using
 * @dev; per-route policy lives in fib6_ifdown().
 */
void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.event = event,
		},
	};

	fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
}
3958
/* Full IPv6 teardown for a device: sync the FIB for @event, flush the
 * uncached route list, and drop its neighbour entries.
 */
void rt6_disable_ip(struct net_device *dev, unsigned long event)
{
	rt6_sync_down_dev(dev, event);
	rt6_uncached_list_flush_dev(dev_net(dev), dev);
	neigh_ifdown(&nd_tbl, dev);
}
3965
/* Argument bundle for rt6_mtu_change_route() via fib6_clean_all() */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new MTU */
};
3970
/* fib6_clean_all() callback: refresh the RTAX_MTU metric of routes using
 * the device whose MTU changed, plus their cached exception routes.
 * Always returns 0 (never deletes).
 */
static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e. jumbo frame)
	 */
	if (rt->fib6_nh.nh_dev == arg->dev &&
	    !fib6_metric_locked(rt, RTAX_MTU)) {
		u32 mtu = rt->fib6_pmtu;

		/* shrink to the new device MTU, or grow only if the route
		 * was tracking the old device MTU
		 */
		if (mtu >= arg->mtu ||
		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
			fib6_metric_set(rt, RTAX_MTU, arg->mtu);

		/* exception-table updates require the exception lock */
		spin_lock_bh(&rt6_exception_lock);
		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}
4005
4006 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4007 {
4008         struct rt6_mtu_change_arg arg = {
4009                 .dev = dev,
4010                 .mtu = mtu,
4011         };
4012
4013         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4014 }
4015
/* Netlink attribute validation policy for IPv6 RTM_* route requests;
 * attributes not listed are accepted without type/length checks.
 */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]		= { .len = sizeof(struct in6_addr) },
	[RTA_OIF]		= { .type = NLA_U32 },
	[RTA_IIF]		= { .type = NLA_U32 },
	[RTA_PRIORITY]		= { .type = NLA_U32 },
	[RTA_METRICS]		= { .type = NLA_NESTED },
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]		= { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
	[RTA_ENCAP]		= { .type = NLA_NESTED },
	[RTA_EXPIRES]		= { .type = NLA_U32 },
	[RTA_UID]		= { .type = NLA_U32 },
	[RTA_MARK]		= { .type = NLA_U32 },
};
4030
/* Parse an RTM_NEWROUTE/RTM_DELROUTE netlink message into @cfg.
 * Returns 0 on success or a negative errno (-EINVAL for malformed
 * attributes, or the underlying parse/validation error).
 * Note: fc_mx/fc_mp point into the nlmsg buffer, not copies.
 */
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg,
			      struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  NULL);
	if (err < 0)
		goto errout;

	/* default error for the length checks below */
	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	memset(cfg, 0, sizeof(*cfg));

	cfg->fc_table = rtm->rtm_table;
	cfg->fc_dst_len = rtm->rtm_dst_len;
	cfg->fc_src_len = rtm->rtm_src_len;
	cfg->fc_flags = RTF_UP;
	cfg->fc_protocol = rtm->rtm_protocol;
	cfg->fc_type = rtm->rtm_type;

	/* reject-style route types all map to RTF_REJECT */
	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);

	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
	cfg->fc_nlinfo.nlh = nlh;
	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}

	if (tb[RTA_DST]) {
		/* attribute must carry at least the prefix bytes */
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	/* RTA_TABLE overrides the rtm_table header field */
	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
						     cfg->fc_mp_len, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_PREF]) {
		/* out-of-range preferences silently fall back to medium */
		pref = nla_get_u8(tb[RTA_PREF]);
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

	if (tb[RTA_ENCAP_TYPE]) {
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_EXPIRES]) {
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

		/* infinite timeouts mean "no expiry", so only set
		 * RTF_EXPIRES for finite ones
		 */
		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
		}
	}

	err = 0;
errout:
	return err;
}
4156
/* One parsed nexthop of a multipath request, queued on rt6_nh_list */
struct rt6_nh {
	struct rt6_info *rt6_info;	/* route built for this nexthop */
	struct fib6_config r_cfg;	/* per-nexthop copy of the config */
	struct list_head next;		/* linkage in the rt6_nh_list */
};
4162
/* Log every nexthop of a multipath replace that failed midway, so the
 * admin can reconcile what was actually installed.
 */
static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
{
	struct rt6_nh *nh;

	list_for_each_entry(nh, rt6_nh_list, next) {
		pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
			&nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
			nh->r_cfg.fc_ifindex);
	}
}
4173
4174 static int ip6_route_info_append(struct net *net,
4175                                  struct list_head *rt6_nh_list,
4176                                  struct rt6_info *rt, struct fib6_config *r_cfg)
4177 {
4178         struct rt6_nh *nh;
4179         int err = -EEXIST;
4180
4181         list_for_each_entry(nh, rt6_nh_list, next) {
4182                 /* check if rt6_info already exists */
4183                 if (rt6_duplicate_nexthop(nh->rt6_info, rt))
4184                         return err;
4185         }
4186
4187         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4188         if (!nh)
4189                 return -ENOMEM;
4190         nh->rt6_info = rt;
4191         err = ip6_convert_metrics(net, rt, r_cfg);
4192         if (err) {
4193                 kfree(nh);
4194                 return err;
4195         }
4196         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4197         list_add_tail(&nh->next, rt6_nh_list);
4198
4199         return 0;
4200 }
4201
/* Send the RTM_NEWROUTE notification for a (possibly multipath) insert;
 * silently does nothing if @rt and the computed route are both NULL.
 */
static void ip6_route_mpath_notify(struct rt6_info *rt,
				   struct rt6_info *rt_last,
				   struct nl_info *info,
				   __u16 nlflags)
{
	/* if this is an APPEND route, then rt points to the first route
	 * inserted and rt_last points to last route inserted. Userspace
	 * wants a consistent dump of the route which starts at the first
	 * nexthop. Since sibling routes are always added at the end of
	 * the list, find the first sibling of the last route appended
	 */
	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
		rt = list_first_entry(&rt_last->rt6i_siblings,
				      struct rt6_info,
				      rt6i_siblings);
	}

	if (rt)
		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
}
4222
/* Add one route with multiple nexthops (RTA_MULTIPATH).
 *
 * Each rtnexthop in cfg->fc_mp is parsed into its own fib6_config /
 * rt6_info and queued on a local list; the routes are then inserted one
 * by one with per-nexthop notifications suppressed (info->skip_notify),
 * and a single notification covering the whole multipath route is sent
 * at the end.  If an insertion fails part-way, the nexthops added so
 * far are deleted again.
 */
static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct rt6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct rt6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	__u16 nlflags;
	int remaining;
	int attrlen;
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);

	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * rt6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		/* each nexthop starts from the shared config, then the
		 * per-nexthop ifindex/gateway/encap override it
		 */
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}

		/* rtnh_hops carries "weight - 1" on the wire */
		rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;

		/* on success the list owns the reference to rt */
		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
					    rt, &r_cfg);
		if (err) {
			fib6_info_release(rt);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* for add and replace send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		rt_last = nh->rt6_info;
		err = __ip6_ins_rt(nh->rt6_info, info, extack);
		/* NOTE(review): this drops the list's reference, yet the
		 * pointer is still used below on success — relies on the
		 * FIB tree holding its own reference after a successful
		 * insert; confirm.
		 */
		fib6_info_release(nh->rt6_info);

		/* save reference to first route for notification */
		if (!rt_notif && !err)
			rt_notif = nh->rt6_info;

		/* nh->rt6_info is used or freed at this point, reset to NULL*/
		nh->rt6_info = NULL;
		if (err) {
			if (replace && nhn)
				ip6_print_replace_route_err(&rt6_nh_list);
			err_nh = nh;
			goto add_errout;
		}

		/* Because each route is added like a single route we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
		 */
		/* NOTE(review): nlh is dereferenced here without the NULL
		 * check used when computing 'replace' above — presumably
		 * multipath adds always originate from netlink; confirm.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		nhn++;
	}

	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	goto cleanup;

add_errout:
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg, extack);
	}

cleanup:
	/* release any routes that were never inserted and free the list */
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->rt6_info)
			fib6_info_release(nh->rt6_info);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}
4358
4359 static int ip6_route_multipath_del(struct fib6_config *cfg,
4360                                    struct netlink_ext_ack *extack)
4361 {
4362         struct fib6_config r_cfg;
4363         struct rtnexthop *rtnh;
4364         int remaining;
4365         int attrlen;
4366         int err = 1, last_err = 0;
4367
4368         remaining = cfg->fc_mp_len;
4369         rtnh = (struct rtnexthop *)cfg->fc_mp;
4370
4371         /* Parse a Multipath Entry */
4372         while (rtnh_ok(rtnh, remaining)) {
4373                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4374                 if (rtnh->rtnh_ifindex)
4375                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4376
4377                 attrlen = rtnh_attrlen(rtnh);
4378                 if (attrlen > 0) {
4379                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4380
4381                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4382                         if (nla) {
4383                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4384                                 r_cfg.fc_flags |= RTF_GATEWAY;
4385                         }
4386                 }
4387                 err = ip6_route_del(&r_cfg, extack);
4388                 if (err)
4389                         last_err = err;
4390
4391                 rtnh = rtnh_next(rtnh, &remaining);
4392         }
4393
4394         return last_err;
4395 }
4396
4397 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4398                               struct netlink_ext_ack *extack)
4399 {
4400         struct fib6_config cfg;
4401         int err;
4402
4403         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4404         if (err < 0)
4405                 return err;
4406
4407         if (cfg.fc_mp)
4408                 return ip6_route_multipath_del(&cfg, extack);
4409         else {
4410                 cfg.fc_delete_all_nh = 1;
4411                 return ip6_route_del(&cfg, extack);
4412         }
4413 }
4414
4415 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4416                               struct netlink_ext_ack *extack)
4417 {
4418         struct fib6_config cfg;
4419         int err;
4420
4421         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4422         if (err < 0)
4423                 return err;
4424
4425         if (cfg.fc_mp)
4426                 return ip6_route_multipath_add(&cfg, extack);
4427         else
4428                 return ip6_route_add(&cfg, GFP_KERNEL, extack);
4429 }
4430
4431 static size_t rt6_nlmsg_size(struct rt6_info *rt)
4432 {
4433         int nexthop_len = 0;
4434
4435         if (rt->rt6i_nsiblings) {
4436                 nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
4437                             + NLA_ALIGN(sizeof(struct rtnexthop))
4438                             + nla_total_size(16) /* RTA_GATEWAY */
4439                             + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
4440
4441                 nexthop_len *= rt->rt6i_nsiblings;
4442         }
4443
4444         return NLMSG_ALIGN(sizeof(struct rtmsg))
4445                + nla_total_size(16) /* RTA_SRC */
4446                + nla_total_size(16) /* RTA_DST */
4447                + nla_total_size(16) /* RTA_GATEWAY */
4448                + nla_total_size(16) /* RTA_PREFSRC */
4449                + nla_total_size(4) /* RTA_TABLE */
4450                + nla_total_size(4) /* RTA_IIF */
4451                + nla_total_size(4) /* RTA_OIF */
4452                + nla_total_size(4) /* RTA_PRIORITY */
4453                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4454                + nla_total_size(sizeof(struct rta_cacheinfo))
4455                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4456                + nla_total_size(1) /* RTA_PREF */
4457                + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
4458                + nexthop_len;
4459 }
4460
/* Emit the nexthop attributes of @rt into @skb and accumulate the
 * corresponding RTNH_F_* bits in @*flags.
 *
 * @skip_oif: suppress RTA_OIF — used for multipath encoding, where the
 *            ifindex lives in struct rtnexthop instead.
 *
 * Returns 0 on success or -EMSGSIZE if @skb is full.
 */
static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
			    unsigned int *flags, bool skip_oif)
{
	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		*flags |= RTNH_F_DEAD;

	if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
		*flags |= RTNH_F_LINKDOWN;
		/* a linkdown nexthop is also reported dead when the device
		 * is configured to ignore routes on down links
		 */
		if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
			*flags |= RTNH_F_DEAD;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
			goto nla_put_failure;
	}

	*flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
	if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
		*flags |= RTNH_F_OFFLOAD;

	/* not needed for multipath encoding b/c it has a rtnexthop struct */
	if (!skip_oif && rt->fib6_nh.nh_dev &&
	    nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
		goto nla_put_failure;

	if (rt->fib6_nh.nh_lwtstate &&
	    lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
4496
/* add multipath next hop: emit one struct rtnexthop plus its nested
 * attributes for @rt inside an open RTA_MULTIPATH nest.
 * Returns 0 on success or -EMSGSIZE if @skb is full.
 */
static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
{
	const struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rtnexthop *rtnh;
	unsigned int flags = 0;

	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
	if (!rtnh)
		goto nla_put_failure;

	/* rtnh_hops carries "weight - 1" on the wire */
	rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
	rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;

	/* skip_oif=true: the ifindex above replaces RTA_OIF */
	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
		goto nla_put_failure;

	rtnh->rtnh_flags = flags;

	/* length of rtnetlink header + attributes */
	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
4524
/* Build one RTM_{NEW,DEL}ROUTE message for @rt into @skb.
 *
 * @dst:  resolved dst_entry when answering a route lookup, NULL when
 *        dumping the FIB; when set, metrics and expiry come from it
 *        instead of the fib entry
 * @dest: destination address the lookup resolved (forces rtm_dst_len
 *        to 128 and triggers an RTA_PREFSRC answer when @iif == 0)
 * @src:  resolved source address for subtree routes (CONFIG_IPV6_SUBTREES)
 * @iif:  input interface for input-path lookups, 0 otherwise
 *
 * Returns 0 on success or -EMSGSIZE when @skb is full, in which case
 * the partially built message is cancelled.
 */
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct rt6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	struct rtmsg *rtm;
	struct nlmsghdr *nlh;
	long expires = 0;
	u32 *pmetrics;
	u32 table;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt->rt6i_dst.plen;
	rtm->rtm_src_len = rt->rt6i_src.plen;
	rtm->rtm_tos = 0;
	if (rt->rt6i_table)
		table = rt->rt6i_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	rtm->rtm_table = table;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;

	rtm->rtm_type = rt->fib6_type;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->rt6i_protocol;

	if (rt->rt6i_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	if (dest) {
		if (nla_put_in6_addr(skb, RTA_DST, dest))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		/* multicast destinations are handed to ip6mr, which may
		 * complete the message itself (0), fail (< 0), or fall
		 * through to normal completion (> 0)
		 */
		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dest) {
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->rt6i_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->rt6i_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	/* resolved routes report the dst's metrics, dumps the entry's own */
	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt->rt6i_nsiblings) {
		struct rt6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (rt6_add_nexthop(skb, rt) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->rt6i_siblings, rt6i_siblings) {
			if (rt6_add_nexthop(skb, sibling) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
			goto nla_put_failure;
	}

	if (rt->rt6i_flags & RTF_EXPIRES) {
		expires = dst ? dst->expires : rt->expires;
		expires -= jiffies;
	}

	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
		goto nla_put_failure;


	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
4657
4658 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
4659 {
4660         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4661         struct net *net = arg->net;
4662
4663         if (rt == net->ipv6.fib6_null_entry)
4664                 return 0;
4665
4666         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4667                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4668
4669                 /* user wants prefix routes only */
4670                 if (rtm->rtm_flags & RTM_F_PREFIX &&
4671                     !(rt->rt6i_flags & RTF_PREFIX_RT)) {
4672                         /* success since this is not a prefix route */
4673                         return 1;
4674                 }
4675         }
4676
4677         return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4678                              RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4679                              arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
4680 }
4681
/* RTM_GETROUTE handler: resolve the route for the addresses in the
 * request and reply with a single RTM_NEWROUTE message.
 *
 * RTA_IIF requests an input-path lookup on that device; otherwise an
 * output lookup (optionally scoped by RTA_OIF) is performed.  With
 * RTM_F_FIB_MATCH the matching FIB entry is reported instead of the
 * resolved route.
 */
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6;
	bool fibmatch;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	memset(&fl6, 0, sizeof(fl6));
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (iif) {
		struct net_device *dev;
		int flags = 0;

		/* hold RCU only for the device lookup + input lookup */
		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			rcu_read_unlock();
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);

		rcu_read_unlock();
	} else {
		fl6.flowi6_oif = oif;

		dst = ip6_route_output(net, NULL, &fl6);
	}


	rt = container_of(dst, struct rt6_info, dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	/* an unroutable destination resolved to the null entry */
	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	/* the skb now owns the dst reference; kfree_skb() drops it */
	skb_dst_set(skb, &rt->dst);
	/* NOTE(review): rt->from is assumed valid for a successfully
	 * looked-up route here — confirm it cannot be NULL on this path.
	 */
	if (fibmatch)
		err = rt6_fill_node(net, skb, rt->from, NULL, NULL, NULL, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	else
		err = rt6_fill_node(net, skb, rt->from, dst,
				    &fl6.daddr, &fl6.saddr, iif, RTM_NEWROUTE,
				    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
				    0);
	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}
4802
4803 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
4804                      unsigned int nlm_flags)
4805 {
4806         struct sk_buff *skb;
4807         struct net *net = info->nl_net;
4808         u32 seq;
4809         int err;
4810
4811         err = -ENOBUFS;
4812         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4813
4814         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4815         if (!skb)
4816                 goto errout;
4817
4818         err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
4819                             event, info->portid, seq, nlm_flags);
4820         if (err < 0) {
4821                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4822                 WARN_ON(err == -EMSGSIZE);
4823                 kfree_skb(skb);
4824                 goto errout;
4825         }
4826         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4827                     info->nlh, gfp_any());
4828         return;
4829 errout:
4830         if (err < 0)
4831                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4832 }
4833
/* Netdevice notifier: keep the per-netns special routes (the null
 * entry, plus the prohibit and blackhole entries with
 * CONFIG_IPV6_MULTIPLE_TABLES) attached to the loopback device, and
 * drop their idev references again when it unregisters.
 */
static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	/* only the loopback device hosts the special routes */
	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
		net->ipv6.fib6_null_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	 } else if (event == NETDEV_UNREGISTER &&
		    dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER could be fired for multiple times by
		 * netdev_wait_allrefs(). Make sure we only call this once.
		 */
		in6_dev_put_clear(&net->ipv6.fib6_null_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}
4869
4870 /*
4871  *      /proc
4872  */
4873
4874 #ifdef CONFIG_PROC_FS
4875
/* /proc/net/ipv6_route: seq_file dump of the routing table
 * (ipv6_route_open is defined earlier in this file).
 */
static const struct file_operations ipv6_route_proc_fops = {
	.open		= ipv6_route_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};
4882
/* /proc/net/rt6_stats: one line of seven hex counters —
 * fib_nodes, fib_route_nodes, fib_rt_alloc, fib_rt_entries,
 * fib_rt_cache, current dst-entry count, fib_discarded_routes.
 */
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;
	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}
4897
/* open() for /proc/net/rt6_stats: single-record, netns-aware */
static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
	return single_open_net(inode, file, rt6_stats_seq_show);
}
4902
/* file_operations for /proc/net/rt6_stats */
static const struct file_operations rt6_stats_seq_fops = {
	.open    = rt6_stats_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = single_release_net,
};
4909 #endif  /* CONFIG_PROC_FS */
4910
4911 #ifdef CONFIG_SYSCTL
4912
4913 static
4914 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4915                               void __user *buffer, size_t *lenp, loff_t *ppos)
4916 {
4917         struct net *net;
4918         int delay;
4919         if (!write)
4920                 return -EINVAL;
4921
4922         net = (struct net *)ctl->extra1;
4923         delay = net->ipv6.sysctl.flush_delay;
4924         proc_dointvec(ctl, write, buffer, lenp, ppos);
4925         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
4926         return 0;
4927 }
4928
/* Template for the per-netns net.ipv6.route.* sysctl table.  Cloned by
 * ipv6_route_sysctl_init(), which rewrites each entry's .data to point
 * at the per-netns storage — keep the entry order in sync with the
 * numeric indices used there.
 */
struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		/* same backing variable as gc_min_interval, but exposed
		 * in milliseconds
		 */
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{ }
};
5002
/* Clone ipv6_route_table_template for @net, pointing each entry's .data
 * at the per-netns variables.  Returns the new table (owned by the
 * caller) or NULL on allocation failure.
 */
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		/* indices must match the template's entry order */
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;

		/* Don't export sysctls to unprivileged users */
		/* NOTE(review): a NULL procname terminates a ctl_table, so
		 * clearing the first entry hides the whole table, not just
		 * "flush" — confirm that is the intent.
		 */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
5031 #endif
5032
/* Per-namespace constructor for the IPv6 routing core.  Copies the
 * dst_ops template, clones the special (null / prohibit / blackhole)
 * route templates for this namespace, and seeds the route-GC sysctl
 * defaults.  Returns 0 on success or -ENOMEM, unwinding partially
 * built state through the goto chain at the bottom.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	/* Each namespace gets its own dst_ops copy so dst entry
	 * accounting is per-net.
	 */
	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	/* The template copies must point at this namespace's dst_ops. */
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	/* Default route-GC tunables; exposed via the sysctl table
	 * cloned in ipv6_route_sysctl_init().
	 */
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

	/* Error unwind: each label frees what was allocated after the
	 * point it names and falls through to the earlier labels.
	 */
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
5107
5108 static void __net_exit ip6_route_net_exit(struct net *net)
5109 {
5110         kfree(net->ipv6.fib6_null_entry);
5111         kfree(net->ipv6.ip6_null_entry);
5112 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5113         kfree(net->ipv6.ip6_prohibit_entry);
5114         kfree(net->ipv6.ip6_blk_hole_entry);
5115 #endif
5116         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5117 }
5118
5119 static int __net_init ip6_route_net_init_late(struct net *net)
5120 {
5121 #ifdef CONFIG_PROC_FS
5122         proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
5123         proc_create("rt6_stats", 0444, net->proc_net, &rt6_stats_seq_fops);
5124 #endif
5125         return 0;
5126 }
5127
5128 static void __net_exit ip6_route_net_exit_late(struct net *net)
5129 {
5130 #ifdef CONFIG_PROC_FS
5131         remove_proc_entry("ipv6_route", net->proc_net);
5132         remove_proc_entry("rt6_stats", net->proc_net);
5133 #endif
5134 }
5135
/* Per-namespace lifecycle hooks for the IPv6 routing core. */
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
5140
5141 static int __net_init ipv6_inetpeer_init(struct net *net)
5142 {
5143         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5144
5145         if (!bp)
5146                 return -ENOMEM;
5147         inet_peer_base_init(bp);
5148         net->ipv6.peers = bp;
5149         return 0;
5150 }
5151
5152 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5153 {
5154         struct inet_peer_base *bp = net->ipv6.peers;
5155
5156         net->ipv6.peers = NULL;
5157         inetpeer_invalidate_tree(bp);
5158         kfree(bp);
5159 }
5160
/* Per-namespace lifecycle hooks for the IPv6 inetpeer storage. */
static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};
5165
/* Per-namespace hooks registered late, for the /proc/net files. */
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};
5170
/* Netdevice event notifier; priority is set below addrconf's so its
 * handler runs after the addrconf notifier for the same event.
 */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
5175
5176 void __init ip6_route_init_special_entries(void)
5177 {
5178         /* Registering of the loopback is done before this portion of code,
5179          * the loopback reference in rt6_info will not be taken, do it
5180          * manually for init_net */
5181         init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
5182         init_net.ipv6.fib6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5183         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5184         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5185   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5186         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5187         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5188         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5189         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5190   #endif
5191 }
5192
/* One-time initialization of the IPv6 routing subsystem: creates the
 * rt6_info slab cache, registers the per-namespace operations, the
 * FIB / xfrm / policy-rule layers, the RTM_*ROUTE netlink handlers,
 * the netdevice notifier, and initializes the per-cpu uncached-route
 * lists.  On failure, everything registered so far is unwound via
 * the goto chain at the bottom.
 */
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	/* Blackhole dsts are carved from the same slab cache. */
	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	/* The three rtnetlink handlers share one unwind label:
	 * rtnl_unregister_all() below drops whichever registered.
	 */
	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

	/* Error unwind, newest registration first.  Note the unusually
	 * named labels: "fib6_rules_init" / "xfrm6_init" undo those
	 * calls, they do not repeat them.
	 */
out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
5284
/* Module teardown: undo the registrations made by ip6_route_init(),
 * mirroring (roughly in reverse) the setup order there.
 */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}