/*
 * net/ipv6/route.c
 * (snapshot includes commit: "net/ipv6: Fix missing rcu dereferences on from")
 */
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <trace/events/fib6.h>
67
68 #include <linux/uaccess.h>
69
70 #ifdef CONFIG_SYSCTL
71 #include <linux/sysctl.h>
72 #endif
73
/* Neighbour reachability verdicts used when scoring candidate routes;
 * negative values are failures (see rt6_score_route()/rt6_check_neigh()).
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,		/* route unusable for this lookup */
	RT6_NUD_FAIL_PROBE = -2,	/* neighbour in NUD_FAILED; needs probing */
	RT6_NUD_FAIL_DO_RR = -1,	/* no neighbour entry; try round-robin */
	RT6_NUD_SUCCEED = 1
};
80
81 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
82 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
83 static unsigned int      ip6_mtu(const struct dst_entry *dst);
84 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
85 static void             ip6_dst_destroy(struct dst_entry *);
86 static void             ip6_dst_ifdown(struct dst_entry *,
87                                        struct net_device *dev, int how);
88 static int               ip6_dst_gc(struct dst_ops *ops);
89
90 static int              ip6_pkt_discard(struct sk_buff *skb);
91 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
92 static int              ip6_pkt_prohibit(struct sk_buff *skb);
93 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
94 static void             ip6_link_failure(struct sk_buff *skb);
95 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
96                                            struct sk_buff *skb, u32 mtu);
97 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
98                                         struct sk_buff *skb);
99 static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
100 static size_t rt6_nlmsg_size(struct fib6_info *rt);
101 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
102                          struct fib6_info *rt, struct dst_entry *dst,
103                          struct in6_addr *dest, struct in6_addr *src,
104                          int iif, int type, u32 portid, u32 seq,
105                          unsigned int flags);
106 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
107                                            struct in6_addr *daddr,
108                                            struct in6_addr *saddr);
109
110 #ifdef CONFIG_IPV6_ROUTE_INFO
111 static struct fib6_info *rt6_add_route_info(struct net *net,
112                                            const struct in6_addr *prefix, int prefixlen,
113                                            const struct in6_addr *gwaddr,
114                                            struct net_device *dev,
115                                            unsigned int pref);
116 static struct fib6_info *rt6_get_route_info(struct net *net,
117                                            const struct in6_addr *prefix, int prefixlen,
118                                            const struct in6_addr *gwaddr,
119                                            struct net_device *dev);
120 #endif
121
/* Per-CPU list of rt6_info dsts that are not attached to a fib6 node,
 * so they can be found when their device disappears
 * (see rt6_uncached_list_flush_dev()).
 */
struct uncached_list {
	spinlock_t		lock;	/* protects head */
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
128
/* Link @rt onto the current CPU's uncached list and remember which list
 * it went on (the rt may be deleted from a different CPU later).
 */
void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}
139
/* Unlink @rt from the uncached list it was added to (if any) and drop
 * the per-netns uncached-route counter.
 */
void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		/* use the list recorded at add time; it may belong to
		 * a different CPU than the one running now
		 */
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}
152
/* Retarget every uncached rt6_info that still references @dev (either
 * via its inet6_dev or as dst.dev) at the netns loopback device, so
 * @dev's reference counts can drop to zero on unregister.
 */
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	/* loopback only goes away with the whole netns; nothing to move */
	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				/* swap idev ref: take loopback's, drop dev's */
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				/* swap device ref the same way */
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
184
185 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
186                                              struct sk_buff *skb,
187                                              const void *daddr)
188 {
189         if (!ipv6_addr_any(p))
190                 return (const void *) p;
191         else if (skb)
192                 return &ipv6_hdr(skb)->daddr;
193         return daddr;
194 }
195
196 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
197                                    struct net_device *dev,
198                                    struct sk_buff *skb,
199                                    const void *daddr)
200 {
201         struct neighbour *n;
202
203         daddr = choose_neigh_daddr(gw, skb, daddr);
204         n = __ipv6_neigh_lookup(dev, daddr);
205         if (n)
206                 return n;
207         return neigh_create(&nd_tbl, daddr, dev);
208 }
209
/* dst_ops->neigh_lookup hook: resolve the neighbour for a rt6_info dst,
 * keyed on its gateway address.
 */
static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
}
218
/* dst_ops->confirm_neigh hook: mark the route's neighbour entry as
 * recently confirmed, skipping cases where ND is meaningless.
 */
static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	/* no neighbour discovery on NOARP or loopback devices */
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	/* multicast destinations have no neighbour entry to confirm */
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}
233
/* dst_ops for regular IPv6 routes; template name suggests it is copied
 * per-netns (ip6_dst_alloc() uses net->ipv6.ip6_dst_ops) — the copy
 * happens outside this view.
 */
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	dst_cow_metrics_generic,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};
252
253 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
254 {
255         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
256
257         return mtu ? : dst->dev->mtu;
258 }
259
/* PMTU updates are meaningless on a blackhole dst: intentional no-op */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}
264
/* Redirects are ignored on a blackhole dst: intentional no-op */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}
269
/* dst_ops for blackhole routes: PMTU/redirect events are no-ops,
 * everything else shares the regular IPv6 handlers.
 */
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
};
281
/* Metrics for the template entries below; only the hop-limit slot is
 * explicitly set (to 0).
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};
285
/* Template for the per-netns null fib6 entry: a permanent
 * RTN_UNREACHABLE reject route returned when lookups find nothing.
 */
static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol  = RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,	/* worst possible metric */
	.fib6_ref	= ATOMIC_INIT(1),	/* never freed */
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};
294
/* Template dst for "no route": discards packets and reports
 * -ENETUNREACH to senders.
 */
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),	/* never freed */
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
306
307 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
308
/* Template dst for "prohibit" policy-routing verdicts: discards packets
 * and reports -EACCES to senders.
 */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),	/* never freed */
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
320
/* Template dst for "blackhole" policy-routing verdicts: silently drops
 * packets (dst_discard) with no error signalled to senders.
 */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),	/* never freed */
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
332
333 #endif
334
/* Initialise the rt6-specific part of a freshly allocated rt6_info. */
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	/* dst_alloc() set up the embedded dst; zero only what follows it */
	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}
342
/* Allocate a rt6_info dst with this netns's ip6_dst_ops, zero its
 * rt6 part, and bump the allocation counter.  Returns NULL on failure.
 */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);
358
/* dst_ops->destroy hook: release everything a rt6_info holds — generic
 * metrics, its uncached-list slot, the inet6_dev reference, and the
 * fib6 entry it was cloned from (rt->from).
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	/* rt->from is RCU-managed: detach the pointer under rcu_read_lock
	 * before dropping the reference
	 */
	rcu_read_lock();
	from = rcu_dereference(rt->from);
	rcu_assign_pointer(rt->from, NULL);
	fib6_info_release(from);
	rcu_read_unlock();
}
380
/* dst_ops->ifdown hook: when @dev goes down, repoint the dst's
 * inet6_dev at the netns loopback device so @dev can be released.
 */
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}
397
398 static bool __rt6_check_expired(const struct rt6_info *rt)
399 {
400         if (rt->rt6i_flags & RTF_EXPIRES)
401                 return time_after(jiffies, rt->dst.expires);
402         else
403                 return false;
404 }
405
/* Has this route expired?  Checks the dst's own RTF_EXPIRES deadline
 * first; otherwise a clone is stale once its dst was invalidated or the
 * fib6 entry it came from expired.  Caller must hold rcu_read_lock()
 * for the rt->from dereference.
 */
static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}
421
/* Pick one route among @match and its ECMP siblings using the flow
 * hash against each nexthop's upper bound (weighted selection).
 * Falls back to @match when the hash's owner is unusable.
 */
static struct fib6_info *rt6_multipath_select(const struct net *net,
					      struct fib6_info *match,
					     struct flowi6 *fl6, int oif,
					     const struct sk_buff *skb,
					     int strict)
{
	struct fib6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		/* hash owner found but route unusable: keep @match */
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}
454
455 /*
456  *      Route lookup. rcu_read_lock() should be held.
457  */
458
459 static inline struct fib6_info *rt6_device_match(struct net *net,
460                                                  struct fib6_info *rt,
461                                                     const struct in6_addr *saddr,
462                                                     int oif,
463                                                     int flags)
464 {
465         struct fib6_info *sprt;
466
467         if (!oif && ipv6_addr_any(saddr) &&
468             !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
469                 return rt;
470
471         for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) {
472                 const struct net_device *dev = sprt->fib6_nh.nh_dev;
473
474                 if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
475                         continue;
476
477                 if (oif) {
478                         if (dev->ifindex == oif)
479                                 return sprt;
480                 } else {
481                         if (ipv6_chk_addr(net, saddr, dev,
482                                           flags & RT6_LOOKUP_F_IFACE))
483                                 return sprt;
484                 }
485         }
486
487         if (oif && flags & RT6_LOOKUP_F_IFACE)
488                 return net->ipv6.fib6_null_entry;
489
490         return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
491 }
492
493 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Work item carrying the target address and device of a deferred
 * router-reachability probe (see rt6_probe()).
 */
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;	/* held via dev_hold() until the work runs */
};
499
/* Workqueue handler for rt6_probe(): send a neighbour solicitation to
 * the target's solicited-node multicast address, then drop the device
 * reference taken when the work was scheduled and free the work item.
 */
static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}
511
/* Schedule a deferred neighbour solicitation towards the route's
 * gateway when its reachability is stale, rate-limited via the
 * neighbour's `updated` timestamp and rtr_probe_interval.
 */
static void rt6_probe(struct fib6_info *rt)
{
	struct __rt6_probe_work *work;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
		return;

	nh_gw = &rt->fib6_nh.nh_gw;
	dev = rt->fib6_nh.nh_dev;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		struct inet6_dev *idev;

		/* neighbour already reachable: nothing to probe */
		if (neigh->nud_state & NUD_VALID)
			goto out;

		/* NOTE(review): idev from __in6_dev_get() is dereferenced
		 * below without a NULL check — presumably the nexthop device
		 * always has IPv6 enabled here; verify.
		 */
		idev = __in6_dev_get(dev);
		work = NULL;
		write_lock(&neigh->lock);
		/* recheck under the neigh lock and rate-limit the probe */
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		/* no neighbour entry yet: always worth probing */
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);	/* released in rt6_probe_deferred() */
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
566 #else
/* Probing is compiled out without CONFIG_IPV6_ROUTER_PREF */
static inline void rt6_probe(struct fib6_info *rt)
{
}
570 #endif
571
572 /*
573  * Default Router Selection (RFC 2461 6.3.6)
574  */
575 static inline int rt6_check_dev(struct fib6_info *rt, int oif)
576 {
577         const struct net_device *dev = rt->fib6_nh.nh_dev;
578
579         if (!oif || dev->ifindex == oif)
580                 return 2;
581         return 0;
582 }
583
/* Neighbour-reachability component of the route score.  Non-gateway
 * routes trivially succeed; otherwise the gateway's neighbour state is
 * inspected under rcu_read_lock_bh() + the neigh read lock.
 */
static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	if (rt->fib6_flags & RTF_NONEXTHOP ||
	    !(rt->fib6_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
					  &rt->fib6_nh.nh_gw);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		/* with router preference, anything short of NUD_FAILED
		 * still counts as usable; NUD_FAILED asks for a probe
		 */
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		/* no neighbour entry: usable under router preference,
		 * otherwise fall back to round-robin
		 */
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
615
/* Combined route score: interface match in the low bits plus, with
 * router-preference support, the decoded RTF_PREF value shifted above
 * them.  May return a negative rt6_nud_state when RT6_LOOKUP_F_REACHABLE
 * is requested and the neighbour check fails.
 */
static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}
633
634 /* called with rc_read_lock held */
635 static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
636 {
637         const struct net_device *dev = fib6_info_nh_dev(f6i);
638         bool rc = false;
639
640         if (dev) {
641                 const struct inet6_dev *idev = __in6_dev_get(dev);
642
643                 rc = !!idev->cnf.ignore_routes_with_linkdown;
644         }
645
646         return rc;
647 }
648
/* Score @rt and, if it beats the running best (*mpri), make it the new
 * best match.  Returns the (possibly updated) best match; *do_rr is set
 * when the winning score asks for round-robin selection.
 */
static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
				   int *mpri, struct fib6_info *match,
				   bool *do_rr)
{
	int m;
	bool match_do_rr = false;

	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		goto out;

	/* skip link-down nexthops when the device policy says so */
	if (fib6_ignore_linkdown(rt) &&
	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (fib6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}
687
/* Find the best route among the entries of @fn that share @metric,
 * scanning from @rr_head to the end and then wrapping from @leaf back
 * to @rr_head (round-robin order).  Entries with a different metric
 * (@cont) are only consulted when nothing in the metric group matched.
 */
static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				     struct fib6_info *leaf,
				     struct fib6_info *rr_head,
				     u32 metric, int oif, int strict,
				     bool *do_rr)
{
	struct fib6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	/* wrap around: leaf up to (but excluding) rr_head */
	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->rt6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	/* fall back to the remaining (different-metric) entries */
	for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
726
/* Select the best route from fib6 node @fn, honouring the node's
 * round-robin pointer (fn->rr_ptr) and advancing it when the scoring
 * asked for round-robin.  rcu_read_lock() must be held by the caller;
 * the rr_ptr update itself retakes the table spinlock.
 */
static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
				   int oif, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not points to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->rt6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.fib6_null_entry;
}
776
777 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
778 {
779         return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
780 }
781
782 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Process a Route Information option received in a Router Advertisement
 * (RFC 4191): validate it, then add/update/delete the corresponding
 * RTF_ROUTEINFO route.  Returns 0 on success or -EINVAL on a malformed
 * option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		/* prefixes longer than 64 bits need the full option body */
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	/* a zero-length prefix means the default router itself */
	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	/* zero lifetime withdraws an existing route */
	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		/* drop the lookup/add reference */
		fib6_info_release(rt);
	}
	return 0;
}
856 #endif
857
858 /*
859  *      Misc support functions
860  */
861
862 /* called with rcu_lock held */
863 static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
864 {
865         struct net_device *dev = rt->fib6_nh.nh_dev;
866
867         if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
868                 /* for copies of local routes, dst->dev needs to be the
869                  * device if it is a master device, the master device if
870                  * device is enslaved, and the loopback as the default
871                  */
872                 if (netif_is_l3_slave(dev) &&
873                     !rt6_need_strict(&rt->fib6_dst.addr))
874                         dev = l3mdev_master_dev_rcu(dev);
875                 else if (!netif_is_l3_master(dev))
876                         dev = dev_net(dev)->loopback_dev;
877                 /* last case is netif_is_l3_master(dev) is true in which
878                  * case we want dev returned to be dev
879                  */
880         }
881
882         return dev;
883 }
884
/* Per route-type (RTN_*) property table: the error code reported via
 * dst.error for reject route types, 0 for types that forward or
 * deliver normally.  Indexed by fib6_type.
 */
static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};
899
/* Map a fib6_type (RTN_*) to the dst error code used for reject routes;
 * returns 0 when the type is not a reject type (see fib6_prop).
 */
static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}
904
905 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
906 {
907         unsigned short flags = 0;
908
909         if (rt->dst_nocount)
910                 flags |= DST_NOCOUNT;
911         if (rt->dst_nopolicy)
912                 flags |= DST_NOPOLICY;
913         if (rt->dst_host)
914                 flags |= DST_HOST;
915
916         return flags;
917 }
918
919 static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
920 {
921         rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);
922
923         switch (ort->fib6_type) {
924         case RTN_BLACKHOLE:
925                 rt->dst.output = dst_discard_out;
926                 rt->dst.input = dst_discard;
927                 break;
928         case RTN_PROHIBIT:
929                 rt->dst.output = ip6_pkt_prohibit_out;
930                 rt->dst.input = ip6_pkt_prohibit;
931                 break;
932         case RTN_THROW:
933         case RTN_UNREACHABLE:
934         default:
935                 rt->dst.output = ip6_pkt_discard_out;
936                 rt->dst.input = ip6_pkt_discard;
937                 break;
938         }
939 }
940
941 static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
942 {
943         rt->dst.flags |= fib6_info_dst_flags(ort);
944
945         if (ort->fib6_flags & RTF_REJECT) {
946                 ip6_rt_init_dst_reject(rt, ort);
947                 return;
948         }
949
950         rt->dst.error = 0;
951         rt->dst.output = ip6_output;
952
953         if (ort->fib6_type == RTN_LOCAL) {
954                 rt->dst.input = ip6_input;
955         } else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
956                 rt->dst.input = ip6_mc_input;
957         } else {
958                 rt->dst.input = ip6_forward;
959         }
960
961         if (ort->fib6_nh.nh_lwtstate) {
962                 rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
963                 lwtunnel_set_redirect(&rt->dst);
964         }
965
966         rt->dst.lastuse = jiffies;
967 }
968
/* Bind @rt to its parent fib entry @from: take a reference on @from,
 * publish it via RCU in rt->from, and share @from's metrics with the
 * new dst.
 */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	/* hold the reference before the pointer becomes visible to
	 * RCU readers of rt->from
	 */
	fib6_info_hold(from);
	rcu_assign_pointer(rt->from, from);
	dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
	if (from->fib6_metrics != &dst_default_metrics) {
		/* shared, refcounted metrics: the dst must drop its
		 * reference on release
		 */
		rt->dst._metrics |= DST_METRICS_REFCOUNTED;
		refcount_inc(&from->fib6_metrics->refcnt);
	}
}
980
/* Initialize the rt6i_* fields of @rt from fib entry @ort and bind the
 * two together via rt6_set_from().  dst fields are set up first by
 * ip6_rt_init_dst().
 */
static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
{
	struct net_device *dev = fib6_info_nh_dev(ort);

	ip6_rt_init_dst(rt, ort);

	rt->rt6i_dst = ort->fib6_dst;
	/* dev may be NULL; in6_dev_get() takes an idev reference —
	 * NOTE(review): released on dst teardown, presumably in
	 * ip6_dst_destroy() (not visible in this chunk)
	 */
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
	rt->rt6i_flags = ort->fib6_flags;
	rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->fib6_src;
#endif
	rt->rt6i_prefsrc = ort->fib6_prefsrc;
	rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
}
998
/* Walk back up the fib trie from @fn until a node carrying route info
 * (RTN_RTINFO) is found, descending into a parent's source-address
 * subtree when one exists.  Returns NULL once the tree root
 * (RTN_TL_ROOT) is reached without a match.  Uses rcu_dereference():
 * caller must be inside an RCU read-side section.
 */
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}
1016
1017 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
1018                           bool null_fallback)
1019 {
1020         struct rt6_info *rt = *prt;
1021
1022         if (dst_hold_safe(&rt->dst))
1023                 return true;
1024         if (null_fallback) {
1025                 rt = net->ipv6.ip6_null_entry;
1026                 dst_hold(&rt->dst);
1027         } else {
1028                 rt = NULL;
1029         }
1030         *prt = rt;
1031         return false;
1032 }
1033
1034 /* called with rcu_lock held */
1035 static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
1036 {
1037         unsigned short flags = fib6_info_dst_flags(rt);
1038         struct net_device *dev = rt->fib6_nh.nh_dev;
1039         struct rt6_info *nrt;
1040
1041         nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1042         if (nrt)
1043                 ip6_rt_copy_init(nrt, rt);
1044
1045         return nrt;
1046 }
1047
/* Core table lookup: walk the fib trie of @table for fl6->daddr,
 * backtracking while only the null entry matches, then return a dst —
 * either from the exception cache or freshly created from the matching
 * fib6_info.  Always returns a held rt6_info (possibly the null entry).
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_info *f6i;
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	f6i = rcu_dereference(fn->leaf);
	if (!f6i) {
		f6i = net->ipv6.fib6_null_entry;
	} else {
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
				      fl6->flowi6_oif, flags);
		/* multipath selection only when no oif was forced */
		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
			f6i = rt6_multipath_select(net, f6i, fl6,
						   fl6->flowi6_oif, skb, flags);
	}
	if (f6i == net->ipv6.fib6_null_entry) {
		/* no usable route at this node: back up and retry */
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);
	} else if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		/* no cached dst: build one from the fib entry, falling
		 * back to the null entry on allocation failure
		 */
		rt = ip6_create_rt_rcu(f6i);
		if (!rt) {
			rt = net->ipv6.ip6_null_entry;
			dst_hold(&rt->dst);
		}
	}

	rcu_read_unlock();

	trace_fib6_table_lookup(net, rt, table, fl6);

	return rt;
}
1102
/* Policy-rule aware lookup: resolve the table through fib6_rule_lookup()
 * and perform ip6_pol_route_lookup() on it.  Returns a held dst.
 */
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
1109
1110 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1111                             const struct in6_addr *saddr, int oif,
1112                             const struct sk_buff *skb, int strict)
1113 {
1114         struct flowi6 fl6 = {
1115                 .flowi6_oif = oif,
1116                 .daddr = *daddr,
1117         };
1118         struct dst_entry *dst;
1119         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1120
1121         if (saddr) {
1122                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1123                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1124         }
1125
1126         dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1127         if (dst->error == 0)
1128                 return (struct rt6_info *) dst;
1129
1130         dst_release(dst);
1131
1132         return NULL;
1133 }
1134 EXPORT_SYMBOL(rt6_lookup);
1135
1136 /* ip6_ins_rt is called with FREE table->tb6_lock.
1137  * It takes new route entry, the addition fails by any reason the
1138  * route is released.
1139  * Caller must hold dst before calling it.
1140  */
1141
1142 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1143                         struct netlink_ext_ack *extack)
1144 {
1145         int err;
1146         struct fib6_table *table;
1147
1148         table = rt->fib6_table;
1149         spin_lock_bh(&table->tb6_lock);
1150         err = fib6_add(&table->tb6_root, rt, info, extack);
1151         spin_unlock_bh(&table->tb6_lock);
1152
1153         return err;
1154 }
1155
/* Insert @rt into its fib table using default netlink info (only the
 * namespace is filled in) and no extended ack.
 */
int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = {	.nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}
1162
/* Clone fib entry @ort into a host-route (/128) RTF_CACHE dst for
 * (daddr[, saddr]).  Returns NULL on allocation failure.  Calls
 * ip6_rt_get_dev_rcu(): caller must hold rcu_read_lock.
 */
static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	dev = ip6_rt_get_dev_rcu(ort);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	/* the clone is a host route for this specific destination */
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		/* daddr equal to a non-/128 prefix address is an
		 * anycast address
		 */
		if (ort->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		/* narrow the source prefix to the specific saddr too */
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
1199
1200 static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
1201 {
1202         unsigned short flags = fib6_info_dst_flags(rt);
1203         struct net_device *dev;
1204         struct rt6_info *pcpu_rt;
1205
1206         rcu_read_lock();
1207         dev = ip6_rt_get_dev_rcu(rt);
1208         pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
1209         rcu_read_unlock();
1210         if (!pcpu_rt)
1211                 return NULL;
1212         ip6_rt_copy_init(pcpu_rt, rt);
1213         pcpu_rt->rt6i_flags |= RTF_PCPU;
1214         return pcpu_rt;
1215 }
1216
/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	/* take a reference; if the dst is already being destroyed,
	 * ip6_hold_safe() resets pcpu_rt to NULL (null_fallback false)
	 */
	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt, false);

	return pcpu_rt;
}
1230
/* Create the per-cpu cached dst for @rt on this cpu and publish it in
 * the pcpu slot.  Returns the new (held) dst, or a held null entry on
 * allocation failure.
 */
static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	/* the slot must have been empty — NOTE(review): presumably
	 * callers only reach here after rt6_get_pcpu_route() returned
	 * NULL for this cpu; confirm against callers (not in this chunk)
	 */
	BUG_ON(prev);

	return pcpu_rt;
}
1249
1250 /* exception hash table implementation
1251  */
1252 static DEFINE_SPINLOCK(rt6_exception_lock);
1253
/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	hlist_del_rcu(&rt6_ex->hlist);
	/* drop the cache's reference on the cached rt6_info */
	dst_release(&rt6_ex->rt6i->dst);
	/* RCU readers may still be walking the chain: defer the free */
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
	net->ipv6.rt6_stats->fib_rt_cache--;
}
1273
1274 /* Remove oldest rt6_ex in bucket and free the memory
1275  * Caller must hold rt6_exception_lock
1276  */
1277 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1278 {
1279         struct rt6_exception *rt6_ex, *oldest = NULL;
1280
1281         if (!bucket)
1282                 return;
1283
1284         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1285                 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1286                         oldest = rt6_ex;
1287         }
1288         rt6_remove_exception(bucket, oldest);
1289 }
1290
/* Hash (dst[, src]) into an exception bucket index in
 * [0, 2^FIB6_EXCEPTION_BUCKET_SIZE_SHIFT).  The jhash seed is
 * lazily initialized once from random bytes.
 */
static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	/* fold the source address in only for subtree (src-routed) use */
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
1306
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 *
 * Returns the matching rt6_exception or NULL.  On return *bucket has
 * been advanced to the bucket selected by the hash.
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		/* with subtrees the source address must match too */
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1339
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 *
 * RCU variant of __rt6_find_exception_spinlock(): walks the chain with
 * hlist_for_each_entry_rcu.  On return *bucket has been advanced to the
 * bucket selected by the hash.
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		/* with subtrees the source address must match too */
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1374
1375 static unsigned int fib6_mtu(const struct fib6_info *rt)
1376 {
1377         unsigned int mtu;
1378
1379         if (rt->fib6_pmtu) {
1380                 mtu = rt->fib6_pmtu;
1381         } else {
1382                 struct net_device *dev = fib6_info_nh_dev(rt);
1383                 struct inet6_dev *idev;
1384
1385                 rcu_read_lock();
1386                 idev = __in6_dev_get(dev);
1387                 mtu = idev->cnf.mtu6;
1388                 rcu_read_unlock();
1389         }
1390
1391         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1392
1393         return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
1394 }
1395
/* Insert cached route @nrt into the exception table of its parent fib
 * entry @ort, allocating the bucket array on first use.  An existing
 * exception for the same (daddr[, saddr]) is replaced, and the oldest
 * entry in a bucket is evicted once its depth exceeds FIB6_MAX_DEPTH.
 *
 * Returns 0 on success; -EINVAL if the table has been flushed or nrt's
 * mtu is not below ort's; -ENOMEM on allocation failure.  On success
 * the table's sernum is bumped to invalidate cached dsts and GC is
 * kicked.
 */
static int rt6_insert_exception(struct rt6_info *nrt,
				struct fib6_info *ort)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	/* once flushed (fib entry being torn down) never re-create */
	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->fib6_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
		err = -EINVAL;
		goto out;
	}

	/* replace any existing exception for the same key */
	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->fib6_table->tb6_lock);
		fib6_update_sernum(net, ort);
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}
1480
/* Remove every exception attached to fib entry @rt and mark the entry
 * so rt6_insert_exception() refuses further inserts.  The bucket array
 * itself is not freed here.
 */
void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() to recreate the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		/* _safe variant: rt6_remove_exception unlinks entries */
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}
1507
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 *
 * Returns the cached rt6_info for (daddr[, saddr]), or NULL when there
 * is no entry or the entry has expired.  No reference is taken.
 */
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->fib6_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}
1539
1540 /* Remove the passed in cached rt from the hash table that contains it */
1541 static int rt6_remove_exception_rt(struct rt6_info *rt)
1542 {
1543         struct rt6_exception_bucket *bucket;
1544         struct in6_addr *src_key = NULL;
1545         struct rt6_exception *rt6_ex;
1546         struct fib6_info *from;
1547         int err;
1548
1549         from = rcu_dereference_protected(rt->from,
1550                                          lockdep_is_held(&rt6_exception_lock));
1551         if (!from ||
1552             !(rt->rt6i_flags & RTF_CACHE))
1553                 return -EINVAL;
1554
1555         if (!rcu_access_pointer(from->rt6i_exception_bucket))
1556                 return -ENOENT;
1557
1558         spin_lock_bh(&rt6_exception_lock);
1559         bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1560                                     lockdep_is_held(&rt6_exception_lock));
1561 #ifdef CONFIG_IPV6_SUBTREES
1562         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1563          * and exception table is indexed by a hash of
1564          * both rt6i_dst and rt6i_src.
1565          * Otherwise, the exception table is indexed by
1566          * a hash of only rt6i_dst.
1567          */
1568         if (from->fib6_src.plen)
1569                 src_key = &rt->rt6i_src.addr;
1570 #endif
1571         rt6_ex = __rt6_find_exception_spinlock(&bucket,
1572                                                &rt->rt6i_dst.addr,
1573                                                src_key);
1574         if (rt6_ex) {
1575                 rt6_remove_exception(bucket, rt6_ex);
1576                 err = 0;
1577         } else {
1578                 err = -ENOENT;
1579         }
1580
1581         spin_unlock_bh(&rt6_exception_lock);
1582         return err;
1583 }
1584
1585 /* Find rt6_ex which contains the passed in rt cache and
1586  * refresh its stamp
1587  */
1588 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1589 {
1590         struct rt6_exception_bucket *bucket;
1591         struct fib6_info *from = rt->from;
1592         struct in6_addr *src_key = NULL;
1593         struct rt6_exception *rt6_ex;
1594
1595         if (!from ||
1596             !(rt->rt6i_flags & RTF_CACHE))
1597                 return;
1598
1599         rcu_read_lock();
1600         bucket = rcu_dereference(from->rt6i_exception_bucket);
1601
1602 #ifdef CONFIG_IPV6_SUBTREES
1603         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1604          * and exception table is indexed by a hash of
1605          * both rt6i_dst and rt6i_src.
1606          * Otherwise, the exception table is indexed by
1607          * a hash of only rt6i_dst.
1608          */
1609         if (from->fib6_src.plen)
1610                 src_key = &rt->rt6i_src.addr;
1611 #endif
1612         rt6_ex = __rt6_find_exception_rcu(&bucket,
1613                                           &rt->rt6i_dst.addr,
1614                                           src_key);
1615         if (rt6_ex)
1616                 rt6_ex->stamp = jiffies;
1617
1618         rcu_read_unlock();
1619 }
1620
1621 static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt)
1622 {
1623         struct rt6_exception_bucket *bucket;
1624         struct rt6_exception *rt6_ex;
1625         int i;
1626
1627         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1628                                         lockdep_is_held(&rt6_exception_lock));
1629
1630         if (bucket) {
1631                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1632                         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1633                                 rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
1634                         }
1635                         bucket++;
1636                 }
1637         }
1638 }
1639
1640 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1641                                          struct rt6_info *rt, int mtu)
1642 {
1643         /* If the new MTU is lower than the route PMTU, this new MTU will be the
1644          * lowest MTU in the path: always allow updating the route PMTU to
1645          * reflect PMTU decreases.
1646          *
1647          * If the new MTU is higher, and the route PMTU is equal to the local
1648          * MTU, this means the old MTU is the lowest in the path, so allow
1649          * updating it: if other nodes now have lower MTUs, PMTU discovery will
1650          * handle this.
1651          */
1652
1653         if (dst_mtu(&rt->dst) >= mtu)
1654                 return true;
1655
1656         if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1657                 return true;
1658
1659         return false;
1660 }
1661
/* Propagate an MTU change on @idev to @rt's cached exception routes,
 * where rt6_mtu_change_route_allowed() permits it.  Caller must hold
 * rt6_exception_lock (see the rcu_dereference_protected condition).
 */
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}
1690
1691 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
1692
/* Remove from @rt's exception table every cached gateway route whose
 * gateway equals @gateway (e.g. after the router became unreachable).
 */
static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* cheap lockless precheck: nothing to do without a bucket */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			/* _safe variant: entries are unlinked in-loop */
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}
1727
/* Decide during GC whether exception @rt6_ex should be removed: prune
 * aged-out non-expiring entries, expired entries, and gateway entries
 * whose neighbour is no longer a router; otherwise bump gc_args->more
 * (entry still alive).  Uses __ipv6_neigh_lookup_noref() —
 * NOTE(review): presumably the caller holds rcu/bh protection for the
 * noref neighbour lookup; confirm in rt6_age_exceptions().
 */
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others have still references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}
1771
1772 void rt6_age_exceptions(struct fib6_info *rt,
1773                         struct fib6_gc_args *gc_args,
1774                         unsigned long now)
1775 {
1776         struct rt6_exception_bucket *bucket;
1777         struct rt6_exception *rt6_ex;
1778         struct hlist_node *tmp;
1779         int i;
1780
1781         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1782                 return;
1783
1784         rcu_read_lock_bh();
1785         spin_lock(&rt6_exception_lock);
1786         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1787                                     lockdep_is_held(&rt6_exception_lock));
1788
1789         if (bucket) {
1790                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1791                         hlist_for_each_entry_safe(rt6_ex, tmp,
1792                                                   &bucket->chain, hlist) {
1793                                 rt6_age_examine_exception(bucket, rt6_ex,
1794                                                           gc_args, now);
1795                         }
1796                         bucket++;
1797                 }
1798         }
1799         spin_unlock(&rt6_exception_lock);
1800         rcu_read_unlock_bh();
1801 }
1802
/* Policy-routing lookup in one fib table.
 *
 * Finds the best fib6_info for @fl6 (with multipath sibling selection)
 * and converts it into a reference-counted rt6_info:
 *  - a matching cached exception route, when one exists;
 *  - an uncached RTF_CACHE clone for FLOWI_FLAG_KNOWN_NH on a
 *    gateway-less route (see comment in the branch below);
 *  - otherwise a per-cpu copy of the route.
 * Never returns NULL: falls back to ip6_null_entry when nothing
 * matches.  The returned dst carries a reference for the caller.
 */
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_node *fn, *saved_fn;
	struct fib6_info *f6i;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	/* with forwarding disabled, first restrict the search to
	 * (probably) reachable routers; relaxed below if nothing matches
	 */
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	f6i = rt6_select(net, fn, oif, strict);
	if (f6i->fib6_nsiblings)
		f6i = rt6_multipath_select(net, f6i, fl6, oif, skb, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
		/* nothing in this node: climb the tree, then retry once
		 * without the reachability restriction
		 */
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		trace_fib6_table_lookup(net, rt, table, fl6);
		return rt;
	}

	/*Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		trace_fib6_table_lookup(net, rt, table, fl6);
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(f6i->fib6_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		trace_fib6_table_lookup(net, uncached_rt, table, fl6);
		return uncached_rt;

	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(f6i);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, f6i);

		local_bh_enable();
		rcu_read_unlock();
		trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
1903
1904 static struct rt6_info *ip6_pol_route_input(struct net *net,
1905                                             struct fib6_table *table,
1906                                             struct flowi6 *fl6,
1907                                             const struct sk_buff *skb,
1908                                             int flags)
1909 {
1910         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1911 }
1912
1913 struct dst_entry *ip6_route_input_lookup(struct net *net,
1914                                          struct net_device *dev,
1915                                          struct flowi6 *fl6,
1916                                          const struct sk_buff *skb,
1917                                          int flags)
1918 {
1919         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1920                 flags |= RT6_LOOKUP_F_IFACE;
1921
1922         return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1923 }
1924 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1925
1926 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1927                                   struct flow_keys *keys,
1928                                   struct flow_keys *flkeys)
1929 {
1930         const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1931         const struct ipv6hdr *key_iph = outer_iph;
1932         struct flow_keys *_flkeys = flkeys;
1933         const struct ipv6hdr *inner_iph;
1934         const struct icmp6hdr *icmph;
1935         struct ipv6hdr _inner_iph;
1936
1937         if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1938                 goto out;
1939
1940         icmph = icmp6_hdr(skb);
1941         if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1942             icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1943             icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1944             icmph->icmp6_type != ICMPV6_PARAMPROB)
1945                 goto out;
1946
1947         inner_iph = skb_header_pointer(skb,
1948                                        skb_transport_offset(skb) + sizeof(*icmph),
1949                                        sizeof(_inner_iph), &_inner_iph);
1950         if (!inner_iph)
1951                 goto out;
1952
1953         key_iph = inner_iph;
1954         _flkeys = NULL;
1955 out:
1956         if (_flkeys) {
1957                 keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
1958                 keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
1959                 keys->tags.flow_label = _flkeys->tags.flow_label;
1960                 keys->basic.ip_proto = _flkeys->basic.ip_proto;
1961         } else {
1962                 keys->addrs.v6addrs.src = key_iph->saddr;
1963                 keys->addrs.v6addrs.dst = key_iph->daddr;
1964                 keys->tags.flow_label = ip6_flowinfo(key_iph);
1965                 keys->basic.ip_proto = key_iph->nexthdr;
1966         }
1967 }
1968
/* if skb is set it will be used and fl6 can be NULL */
/* Compute the hash used to select among ECMP siblings.
 *
 * Policy (ip6_multipath_hash_policy()):
 *   0 - L3: saddr/daddr/flowlabel/proto (inner packet for ICMPv6
 *       errors, see ip6_multipath_l3_keys())
 *   1 - L4: addresses plus ports and protocol
 * The result is shifted right by one; presumably to keep the value
 * non-negative when used as a signed int — TODO confirm against
 * callers.
 * NOTE(review): the switch has no default; hash_keys would be used
 * uninitialized for policy values other than 0/1.  The sysctl
 * currently appears to bound the policy, but this is worth verifying.
 */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}
2025
/* Input-path route resolution: build a flowi6 from the packet's IPv6
 * header, fold in RX tunnel metadata (collect_md) when present,
 * precompute a multipath hash for ICMPv6 packets, and attach the
 * looked-up dst to @skb.
 */
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	/* only RX-side tunnel metadata contributes the tunnel id */
	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	/* fib rules may want the dissected flow; reuse it below */
	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}
2055
2056 static struct rt6_info *ip6_pol_route_output(struct net *net,
2057                                              struct fib6_table *table,
2058                                              struct flowi6 *fl6,
2059                                              const struct sk_buff *skb,
2060                                              int flags)
2061 {
2062         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2063 }
2064
2065 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
2066                                          struct flowi6 *fl6, int flags)
2067 {
2068         bool any_src;
2069
2070         if (rt6_need_strict(&fl6->daddr)) {
2071                 struct dst_entry *dst;
2072
2073                 dst = l3mdev_link_scope_lookup(net, fl6);
2074                 if (dst)
2075                         return dst;
2076         }
2077
2078         fl6->flowi6_iif = LOOPBACK_IFINDEX;
2079
2080         any_src = ipv6_addr_any(&fl6->saddr);
2081         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2082             (fl6->flowi6_oif && any_src))
2083                 flags |= RT6_LOOKUP_F_IFACE;
2084
2085         if (!any_src)
2086                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2087         else if (sk)
2088                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2089
2090         return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2091 }
2092 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2093
/* Replace @dst_orig with a blackhole dst: one that keeps the
 * original's metrics, gateway, flags (minus RTF_PCPU) and address
 * keys, but discards every packet sent through it (input/output are
 * dst_discard*).  The new dst is bound to the loopback device.
 * Consumes the caller's reference on @dst_orig; returns the new dst,
 * or ERR_PTR(-ENOMEM) on allocation failure.
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new = &rt->dst;
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}
2126
2127 /*
2128  *      Destination cache support functions
2129  */
2130
2131 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2132 {
2133         u32 rt_cookie = 0;
2134
2135         if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2136                 return false;
2137
2138         if (fib6_check_expired(f6i))
2139                 return false;
2140
2141         return true;
2142 }
2143
2144 static struct dst_entry *rt6_check(struct rt6_info *rt,
2145                                    struct fib6_info *from,
2146                                    u32 cookie)
2147 {
2148         u32 rt_cookie = 0;
2149
2150         if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2151             rt_cookie != cookie)
2152                 return NULL;
2153
2154         if (rt6_check_expired(rt))
2155                 return NULL;
2156
2157         return &rt->dst;
2158 }
2159
2160 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2161                                             struct fib6_info *from,
2162                                             u32 cookie)
2163 {
2164         if (!__rt6_check_expired(rt) &&
2165             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2166             fib6_check(from, cookie))
2167                 return &rt->dst;
2168         else
2169                 return NULL;
2170 }
2171
/* dst_ops->check: revalidate a cached dst against the fib tree.
 *
 * Returns @dst when still valid, NULL when the caller must perform a
 * fresh route lookup.  Per-cpu copies and uncached clones are checked
 * through their originating fib entry (rt6_dst_from_check()); plain
 * entries go through rt6_check().  rt->from is RCU-managed and may
 * already be NULL if the parent route was deleted.
 */
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct dst_entry *dst_ret;
	struct fib6_info *from;
	struct rt6_info *rt;

	rt = container_of(dst, struct rt6_info, dst);

	rcu_read_lock();

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	from = rcu_dereference(rt->from);

	if (from && (rt->rt6i_flags & RTF_PCPU ||
	    unlikely(!list_empty(&rt->rt6i_uncached))))
		dst_ret = rt6_dst_from_check(rt, from, cookie);
	else
		dst_ret = rt6_check(rt, from, cookie);

	rcu_read_unlock();

	return dst_ret;
}
2199
/* dst_ops->negative_advice: the caller reported trouble with @dst.
 * A cached exception route is removed only once it has expired (the
 * fib tree owns it otherwise); any other dst is simply released.
 * Returns the dst the caller should keep using, or NULL when it was
 * discarded and a fresh lookup is needed.
 */
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			rcu_read_lock();
			if (rt6_check_expired(rt)) {
				rt6_remove_exception_rt(rt);
				dst = NULL;
			}
			rcu_read_unlock();
		} else {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}
2219
/* dst_ops->link_failure: the dst attached to @skb could not deliver
 * (e.g. neighbour resolution failed).  Report address-unreachable to
 * the sender, then either remove the cached exception route or, for
 * RTF_DEFAULT routes still linked in the tree, set the fib node's
 * sernum to -1 so that cookie validation fails on the next
 * ip6_dst_check().
 */
static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		rcu_read_lock();
		if (rt->rt6i_flags & RTF_CACHE) {
			if (dst_hold_safe(&rt->dst))
				rt6_remove_exception_rt(rt);
		} else {
			struct fib6_info *from;
			struct fib6_node *fn;

			/* rt->from may already be NULL if the parent
			 * fib entry was deleted
			 */
			from = rcu_dereference(rt->from);
			if (from) {
				fn = rcu_dereference(from->fib6_node);
				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
					fn->fn_sernum = -1;
			}
		}
		rcu_read_unlock();
	}
}
2246
/* Arm an expiry on @rt0, @timeout jiffies from now, and mark it
 * RTF_EXPIRES.  When the route carried no expiry yet, first inherit
 * the expiry of its originating fib entry (if still linked) before
 * calling dst_set_expires() — NOTE(review): this ordering appears to
 * rely on dst_set_expires() only ever moving dst.expires earlier;
 * confirm against its definition.
 */
static void rt6_update_expires(struct rt6_info *rt0, int timeout)
{
	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
		struct fib6_info *from;

		rcu_read_lock();
		from = rcu_dereference(rt0->from);
		if (from)
			rt0->dst.expires = from->expires;
		rcu_read_unlock();
	}

	dst_set_expires(&rt0->dst, timeout);
	rt0->rt6i_flags |= RTF_EXPIRES;
}
2262
2263 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2264 {
2265         struct net *net = dev_net(rt->dst.dev);
2266
2267         dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2268         rt->rt6i_flags |= RTF_MODIFIED;
2269         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2270 }
2271
2272 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2273 {
2274         bool from_set;
2275
2276         rcu_read_lock();
2277         from_set = !!rcu_dereference(rt->from);
2278         rcu_read_unlock();
2279
2280         return !(rt->rt6i_flags & RTF_CACHE) &&
2281                 (rt->rt6i_flags & RTF_PCPU || from_set);
2282 }
2283
/* Core PMTU update for @dst, clamped to IPV6_MIN_MTU.
 *
 * @iph (from the ICMP payload) or @sk provide the flow addresses; they
 * are used to confirm the neighbour entry and, when the route must not
 * be modified in place, to create a RTF_CACHE exception clone that
 * carries the new mtu.  No-op for local routes, dsts with a locked MTU
 * metric, or when @mtu would not shrink the current one.
 */
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	if (rt6->rt6i_flags & RTF_LOCAL)
		return;

	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}
	dst_confirm_neigh(dst, daddr);
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		/* this route may be updated in place */
		rt6_do_update_pmtu(rt6, mtu);
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
	} else if (daddr) {
		/* shared route: record the mtu in a per-flow exception
		 * clone instead
		 */
		struct fib6_info *from;
		struct rt6_info *nrt6;

		rcu_read_lock();
		from = rcu_dereference(rt6->from);
		nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);
			/* insertion failed: drop our clone */
			if (rt6_insert_exception(nrt6, from))
				dst_release_immediate(&nrt6->dst);
		}
		rcu_read_unlock();
	}
}
2331
2332 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2333                                struct sk_buff *skb, u32 mtu)
2334 {
2335         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2336 }
2337
2338 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2339                      int oif, u32 mark, kuid_t uid)
2340 {
2341         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2342         struct dst_entry *dst;
2343         struct flowi6 fl6;
2344
2345         memset(&fl6, 0, sizeof(fl6));
2346         fl6.flowi6_oif = oif;
2347         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2348         fl6.daddr = iph->daddr;
2349         fl6.saddr = iph->saddr;
2350         fl6.flowlabel = ip6_flowinfo(iph);
2351         fl6.flowi6_uid = uid;
2352
2353         dst = ip6_route_output(net, NULL, &fl6);
2354         if (!dst->error)
2355                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2356         dst_release(dst);
2357 }
2358 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2359
/* Socket variant of ip6_update_pmtu(): update the pmtu for @sk's flow,
 * then, if the socket's cached dst no longer validates against its
 * stored cookie, refresh it (datagram sockets with non-v4-mapped
 * destinations only, and only when the socket is not owned by user
 * context).
 */
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	struct dst_entry *dst;

	ip6_update_pmtu(skb, sock_net(sk), mtu,
			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);

	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2378
/* Cache @dst on @sk via ip6_dst_store().  Pointers to the socket's own
 * addresses are recorded only when they match the flow's addresses
 * (destination always; source only with CONFIG_IPV6_SUBTREES), passing
 * NULL otherwise.
 */
void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
			   const struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_SUBTREES
	struct ipv6_pinfo *np = inet6_sk(sk);
#endif

	ip6_dst_store(sk, dst,
		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
		      &sk->sk_v6_daddr : NULL,
#ifdef CONFIG_IPV6_SUBTREES
		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
		      &np->saddr :
#endif
		      NULL);
}
2395
/* Handle redirects */
/* A flowi6 extended with the redirecting router's address.  The flowi6
 * member must stay first so the struct can be passed through APIs that
 * expect a plain struct flowi6 * and cast back (see
 * __ip6_route_redirect()).
 */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;
};
2401
/* Redirect-validation lookup: find the route currently serving
 * rdfl->fl6 and accept the redirect only if it came from that route's
 * next hop (rdfl->gateway), also consulting cached exception routes
 * whose gateway may already differ from the parent's.
 * Returns a held dst; ip6_null_entry when the route is a reject route.
 */
static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *ret = NULL, *rt_cache;
	struct fib6_info *rt;
	struct fib6_node *fn;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	rcu_read_lock();
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	/* for_each_fib6_node_rt_rcu() iterates 'rt' over fn's routes */
	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;
		if (fib6_check_expired(rt))
			continue;
		if (rt->fib6_flags & RTF_REJECT)
			break;
		if (!(rt->fib6_flags & RTF_GATEWAY))
			continue;
		if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
			continue;
		/* rt_cache's gateway might be different from its 'parent'
		 * in the case of an ip redirect.
		 * So we keep searching in the exception table if the gateway
		 * is different.
		 */
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
			rt_cache = rt6_find_cached_rt(rt,
						      &fl6->daddr,
						      &fl6->saddr);
			if (rt_cache &&
			    ipv6_addr_equal(&rdfl->gateway,
					    &rt_cache->rt6i_gateway)) {
				ret = rt_cache;
				break;
			}
			continue;
		}
		break;
	}

	if (!rt)
		rt = net->ipv6.fib6_null_entry;
	else if (rt->fib6_flags & RTF_REJECT) {
		ret = net->ipv6.ip6_null_entry;
		goto out;
	}

	if (rt == net->ipv6.fib6_null_entry) {
		/* nothing in this node: climb the tree and retry */
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

out:
	if (ret)
		dst_hold(&ret->dst);
	else
		ret = ip6_create_rt_rcu(rt);

	rcu_read_unlock();

	trace_fib6_table_lookup(net, ret, table, fl6);
	return ret;
};
2481
2482 static struct dst_entry *ip6_route_redirect(struct net *net,
2483                                             const struct flowi6 *fl6,
2484                                             const struct sk_buff *skb,
2485                                             const struct in6_addr *gateway)
2486 {
2487         int flags = RT6_LOOKUP_F_HAS_SADDR;
2488         struct ip6rd_flowi rdfl;
2489
2490         rdfl.fl6 = *fl6;
2491         rdfl.gateway = *gateway;
2492
2493         return fib6_rule_lookup(net, &rdfl.fl6, skb,
2494                                 flags, __ip6_route_redirect);
2495 }
2496
2497 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2498                   kuid_t uid)
2499 {
2500         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2501         struct dst_entry *dst;
2502         struct flowi6 fl6;
2503
2504         memset(&fl6, 0, sizeof(fl6));
2505         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2506         fl6.flowi6_oif = oif;
2507         fl6.flowi6_mark = mark;
2508         fl6.daddr = iph->daddr;
2509         fl6.saddr = iph->saddr;
2510         fl6.flowlabel = ip6_flowinfo(iph);
2511         fl6.flowi6_uid = uid;
2512
2513         dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2514         rt6_do_redirect(dst, NULL, skb);
2515         dst_release(dst);
2516 }
2517 EXPORT_SYMBOL_GPL(ip6_redirect);
2518
2519 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2520                             u32 mark)
2521 {
2522         const struct ipv6hdr *iph = ipv6_hdr(skb);
2523         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2524         struct dst_entry *dst;
2525         struct flowi6 fl6;
2526
2527         memset(&fl6, 0, sizeof(fl6));
2528         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2529         fl6.flowi6_oif = oif;
2530         fl6.flowi6_mark = mark;
2531         fl6.daddr = msg->dest;
2532         fl6.saddr = iph->daddr;
2533         fl6.flowi6_uid = sock_net_uid(net, NULL);
2534
2535         dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2536         rt6_do_redirect(dst, NULL, skb);
2537         dst_release(dst);
2538 }
2539
2540 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2541 {
2542         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2543                      sk->sk_uid);
2544 }
2545 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2546
2547 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2548 {
2549         struct net_device *dev = dst->dev;
2550         unsigned int mtu = dst_mtu(dst);
2551         struct net *net = dev_net(dev);
2552
2553         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2554
2555         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2556                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2557
2558         /*
2559          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2560          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2561          * IPV6_MAXPLEN is also valid and means: "any MSS,
2562          * rely only on pmtu discovery"
2563          */
2564         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2565                 mtu = IPV6_MAXPLEN;
2566         return mtu;
2567 }
2568
2569 static unsigned int ip6_mtu(const struct dst_entry *dst)
2570 {
2571         struct inet6_dev *idev;
2572         unsigned int mtu;
2573
2574         mtu = dst_metric_raw(dst, RTAX_MTU);
2575         if (mtu)
2576                 goto out;
2577
2578         mtu = IPV6_MIN_MTU;
2579
2580         rcu_read_lock();
2581         idev = __in6_dev_get(dst->dev);
2582         if (idev)
2583                 mtu = idev->cnf.mtu6;
2584         rcu_read_unlock();
2585
2586 out:
2587         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2588
2589         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2590 }
2591
/* Allocate an uncached, device-bound dst for an outgoing ICMPv6
 * packet toward fl6->daddr.  The route is never inserted into the
 * FIB; it is kept on the uncached list so rt6_disable_ip() can find
 * it when the device goes away.  On success the reference taken on
 * @dev's inet6_dev is transferred to rt->rt6i_idev.
 * Returns the (possibly xfrm-transformed) dst, or an ERR_PTR.
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		/* rt6i_idev was never set, so drop our idev ref here. */
		in6_dev_put(idev);
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output  = ip6_output;
	rt->rt6i_gateway  = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev     = idev;	/* idev ref now owned by rt */
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Add this dst into uncached_list so that rt6_disable_ip() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);
	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}
2630
/* dst_ops garbage collector for IPv6 routes.
 *
 * Invoked by the dst core under entry pressure.  Runs fib6_run_gc()
 * with an adaptive expiry (ip6_rt_gc_expire) that grows on every
 * invocation and decays by the elasticity sysctl, making GC more
 * aggressive the more often it is triggered.  Returns nonzero when
 * the table is still over ip6_rt_max_size, telling the dst core to
 * fail new allocations.
 */
static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	/* Skip the expensive pass if we GC'ed recently and are still
	 * within the size limit.
	 */
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
	/* Exponential decay of the accumulated expiry pressure. */
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
	return entries > rt_max_size;
}
2655
2656 static int ip6_convert_metrics(struct net *net, struct fib6_info *rt,
2657                                struct fib6_config *cfg)
2658 {
2659         struct dst_metrics *p;
2660
2661         if (!cfg->fc_mx)
2662                 return 0;
2663
2664         p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL);
2665         if (unlikely(!p))
2666                 return -ENOMEM;
2667
2668         refcount_set(&p->refcnt, 1);
2669         rt->fib6_metrics = p;
2670
2671         return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics);
2672 }
2673
2674 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2675                                             struct fib6_config *cfg,
2676                                             const struct in6_addr *gw_addr,
2677                                             u32 tbid, int flags)
2678 {
2679         struct flowi6 fl6 = {
2680                 .flowi6_oif = cfg->fc_ifindex,
2681                 .daddr = *gw_addr,
2682                 .saddr = cfg->fc_prefsrc,
2683         };
2684         struct fib6_table *table;
2685         struct rt6_info *rt;
2686
2687         table = fib6_get_table(net, tbid);
2688         if (!table)
2689                 return NULL;
2690
2691         if (!ipv6_addr_any(&cfg->fc_prefsrc))
2692                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2693
2694         flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2695         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2696
2697         /* if table lookup failed, fall back to full lookup */
2698         if (rt == net->ipv6.ip6_null_entry) {
2699                 ip6_rt_put(rt);
2700                 rt = NULL;
2701         }
2702
2703         return rt;
2704 }
2705
2706 static int ip6_route_check_nh_onlink(struct net *net,
2707                                      struct fib6_config *cfg,
2708                                      const struct net_device *dev,
2709                                      struct netlink_ext_ack *extack)
2710 {
2711         u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2712         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2713         u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2714         struct rt6_info *grt;
2715         int err;
2716
2717         err = 0;
2718         grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2719         if (grt) {
2720                 if (!grt->dst.error &&
2721                     (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2722                         NL_SET_ERR_MSG(extack,
2723                                        "Nexthop has invalid gateway or device mismatch");
2724                         err = -EINVAL;
2725                 }
2726
2727                 ip6_rt_put(grt);
2728         }
2729
2730         return err;
2731 }
2732
/* Resolve/validate a (non-onlink) gateway nexthop for a new route.
 *
 * The gateway is looked up first in the route's own table (when one
 * was given), then via a full rt6_lookup().  When no egress device
 * was specified, *_dev and *idev are filled from the found route and
 * references are taken on both for the caller.  Returns 0 when the
 * gateway is directly reachable (not itself behind a gateway),
 * -EHOSTUNREACH otherwise.
 */
static int ip6_route_check_nh(struct net *net,
			      struct fib6_config *cfg,
			      struct net_device **_dev,
			      struct inet6_dev **idev)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	struct net_device *dev = _dev ? *_dev : NULL;
	struct rt6_info *grt = NULL;
	int err = -EHOSTUNREACH;

	if (cfg->fc_table) {
		int flags = RT6_LOOKUP_F_IFACE;

		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
					  cfg->fc_table, flags);
		if (grt) {
			/* Discard the table hit if it is itself behind
			 * a gateway or disagrees with the requested
			 * device; fall through to the full lookup.
			 */
			if (grt->rt6i_flags & RTF_GATEWAY ||
			    (dev && dev != grt->dst.dev)) {
				ip6_rt_put(grt);
				grt = NULL;
			}
		}
	}

	if (!grt)
		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);

	if (!grt)
		goto out;

	if (dev) {
		if (dev != grt->dst.dev) {
			ip6_rt_put(grt);
			goto out;
		}
	} else {
		/* No device given: adopt the one the lookup resolved,
		 * with references held for the caller.
		 */
		*_dev = dev = grt->dst.dev;
		*idev = grt->rt6i_idev;
		dev_hold(dev);
		in6_dev_hold(grt->rt6i_idev);
	}

	if (!(grt->rt6i_flags & RTF_GATEWAY))
		err = 0;

	ip6_rt_put(grt);

out:
	return err;
}
2783
/* Validate the gateway of a new RTF_GATEWAY route and, when no egress
 * device was given, resolve it through the gateway.  *_dev/*idev may
 * be updated (with references held) by ip6_route_check_nh().  On
 * success the resolved device is guaranteed to be non-loopback.
 * Returns 0 or a negative errno with an extack message set.
 */
static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
			   struct net_device **_dev, struct inet6_dev **idev,
			   struct netlink_ext_ack *extack)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	int gwa_type = ipv6_addr_type(gw_addr);
	/* Only non-link-local gateways are checked against addresses
	 * on other devices; a link-local gateway is scoped to @dev.
	 */
	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
	const struct net_device *dev = *_dev;
	bool need_addr_check = !dev;
	int err = -EINVAL;

	/* if gw_addr is local we will fail to detect this in case
	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
	 * will return already-added prefix route via interface that
	 * prefix route was assigned to, which might be non-loopback.
	 */
	if (dev &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
		/* IPv6 strictly inhibits using not link-local
		 * addresses as nexthop address.
		 * Otherwise, router will not able to send redirects.
		 * It is very good, but in some (rare!) circumstances
		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
		 * some exceptions. --ANK
		 * We allow IPv4-mapped nexthops to support RFC4798-type
		 * addressing
		 */
		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
			NL_SET_ERR_MSG(extack, "Invalid gateway address");
			goto out;
		}

		if (cfg->fc_flags & RTNH_F_ONLINK)
			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
		else
			err = ip6_route_check_nh(net, cfg, _dev, idev);

		if (err)
			goto out;
	}

	/* reload in case device was changed */
	dev = *_dev;

	err = -EINVAL;
	if (!dev) {
		NL_SET_ERR_MSG(extack, "Egress device not specified");
		goto out;
	} else if (dev->flags & IFF_LOOPBACK) {
		NL_SET_ERR_MSG(extack,
			       "Egress device can not be loopback device for this route");
		goto out;
	}

	/* if we did not check gw_addr above, do so now that the
	 * egress device has been resolved.
	 */
	if (need_addr_check &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	err = 0;
out:
	return err;
}
2856
/* Build a fib6_info from a fib6/netlink route request.
 *
 * Validates @cfg (internal flags, route type, prefix lengths, device,
 * gateway), resolves the egress device and its inet6_dev, and returns
 * a new fib6_info ready for insertion.  On success the device
 * reference is kept by rt->fib6_nh.nh_dev and the local idev
 * reference is dropped.  On failure all local references are released
 * and ERR_PTR(-errno) is returned, with an extack message where
 * applicable.
 */
static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
					      gfp_t gfp_flags,
					      struct netlink_ext_ack *extack)
{
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct fib6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	struct fib6_table *table;
	int addr_type;
	int err = -EINVAL;

	/* RTF_PCPU is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_PCPU) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
		goto out;
	}

	/* RTF_CACHE is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_CACHE) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
		goto out;
	}

	if (cfg->fc_type > RTN_MAX) {
		NL_SET_ERR_MSG(extack, "Invalid route type");
		goto out;
	}

	if (cfg->fc_dst_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		goto out;
	}
	if (cfg->fc_src_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid source address length");
		goto out;
	}
#ifndef CONFIG_IPV6_SUBTREES
	if (cfg->fc_src_len) {
		NL_SET_ERR_MSG(extack,
			       "Specifying source address requires IPV6_SUBTREES to be enabled");
		goto out;
	}
#endif
	if (cfg->fc_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_metric == 0)
		cfg->fc_metric = IP6_RT_PRIO_USER;

	/* Onlink nexthops bypass gateway resolution, so the device
	 * must be explicit and usable up front.
	 */
	if (cfg->fc_flags & RTNH_F_ONLINK) {
		if (!dev) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop device required for onlink");
			err = -ENODEV;
			goto out;
		}

		if (!(dev->flags & IFF_UP)) {
			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
			err = -ENETDOWN;
			goto out;
		}
	}

	err = -ENOBUFS;
	/* Without NLM_F_CREATE a missing table only triggers a warning
	 * (for backward compatibility); the table is created anyway.
	 */
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
		if (!table) {
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}

	if (!table)
		goto out;

	err = -ENOMEM;
	rt = fib6_info_alloc(gfp_flags);
	if (!rt)
		goto out;

	if (cfg->fc_flags & RTF_ADDRCONF)
		rt->dst_nocount = true;

	err = ip6_convert_metrics(net, rt, cfg);
	if (err < 0)
		goto out;

	if (cfg->fc_flags & RTF_EXPIRES)
		fib6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
	else
		fib6_clean_expires(rt);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->fib6_protocol = cfg->fc_protocol;

	addr_type = ipv6_addr_type(&cfg->fc_dst);

	if (cfg->fc_encap) {
		struct lwtunnel_state *lwtstate;

		err = lwtunnel_build_state(cfg->fc_encap_type,
					   cfg->fc_encap, AF_INET6, cfg,
					   &lwtstate, extack);
		if (err)
			goto out;
		rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
	}

	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->fib6_dst.plen = cfg->fc_dst_len;
	if (rt->fib6_dst.plen == 128)
		rt->dst_host = true;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->fib6_src.plen = cfg->fc_src_len;
#endif

	rt->fib6_metric = cfg->fc_metric;
	rt->fib6_nh.nh_weight = 1;

	rt->fib6_type = cfg->fc_type;

	/* We cannot add true routes via loopback here,
	   they would result in kernel looping; promote them to reject routes
	 */
	if ((cfg->fc_flags & RTF_REJECT) ||
	    (dev && (dev->flags & IFF_LOOPBACK) &&
	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
	     !(cfg->fc_flags & RTF_LOCAL))) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
		goto install_route;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		/* May replace dev/idev with the device the gateway
		 * actually resolves through.
		 */
		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
		if (err)
			goto out;

		rt->fib6_nh.nh_gw = cfg->fc_gateway;
	}

	err = -ENODEV;
	if (!dev)
		goto out;

	if (idev->cnf.disable_ipv6) {
		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
		err = -EACCES;
		goto out;
	}

	if (!(dev->flags & IFF_UP)) {
		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
		err = -ENETDOWN;
		goto out;
	}

	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid source address");
			err = -EINVAL;
			goto out;
		}
		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
		rt->fib6_prefsrc.plen = 128;
	} else
		rt->fib6_prefsrc.plen = 0;

	rt->fib6_flags = cfg->fc_flags;

install_route:
	if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
	    !netif_carrier_ok(dev))
		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
	rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
	rt->fib6_nh.nh_dev = dev;	/* dev reference transferred to rt */
	rt->fib6_table = table;

	cfg->fc_nlinfo.nl_net = dev_net(dev);

	if (idev)
		in6_dev_put(idev);

	return rt;
out:
	if (dev)
		dev_put(dev);
	if (idev)
		in6_dev_put(idev);

	/* fib6_info_release() tolerates rt == NULL. */
	fib6_info_release(rt);
	return ERR_PTR(err);
}
3079
3080 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3081                   struct netlink_ext_ack *extack)
3082 {
3083         struct fib6_info *rt;
3084         int err;
3085
3086         rt = ip6_route_info_create(cfg, gfp_flags, extack);
3087         if (IS_ERR(rt))
3088                 return PTR_ERR(rt);
3089
3090         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3091         fib6_info_release(rt);
3092
3093         return err;
3094 }
3095
3096 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3097 {
3098         struct net *net = info->nl_net;
3099         struct fib6_table *table;
3100         int err;
3101
3102         if (rt == net->ipv6.fib6_null_entry) {
3103                 err = -ENOENT;
3104                 goto out;
3105         }
3106
3107         table = rt->fib6_table;
3108         spin_lock_bh(&table->tb6_lock);
3109         err = fib6_del(rt, info);
3110         spin_unlock_bh(&table->tb6_lock);
3111
3112 out:
3113         fib6_info_release(rt);
3114         return err;
3115 }
3116
3117 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3118 {
3119         struct nl_info info = { .nl_net = net };
3120
3121         return __ip6_del_rt(rt, &info);
3122 }
3123
/* Delete @rt and, when fc_delete_all_nh is set, all of its ECMP
 * siblings in one pass under the table lock.  When possible, a single
 * RTM_DELROUTE notification covering every hop is sent instead of one
 * per sibling.  Drops the caller's reference on @rt.
 */
static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	int err = -ENOENT;

	if (rt == net->ipv6.fib6_null_entry)
		goto out_put;
	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);

	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
		struct fib6_info *sibling, *next_sibling;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			if (rt6_fill_node(net, skb, rt, NULL,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				/* Fall back to per-route notifications. */
				kfree_skb(skb);
				skb = NULL;
			} else
				info->skip_notify = 1;
		}

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings,
					 fib6_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
		}
	}

	err = fib6_del(rt, info);
out_unlock:
	spin_unlock_bh(&table->tb6_lock);
out_put:
	fib6_info_release(rt);

	/* Send the combined notification outside the table lock. */
	if (skb) {
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
	return err;
}
3175
3176 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3177 {
3178         int rc = -ESRCH;
3179
3180         if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3181                 goto out;
3182
3183         if (cfg->fc_flags & RTF_GATEWAY &&
3184             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3185                 goto out;
3186         if (dst_hold_safe(&rt->dst))
3187                 rc = rt6_remove_exception_rt(rt);
3188 out:
3189         return rc;
3190 }
3191
3192 static int ip6_route_del(struct fib6_config *cfg,
3193                          struct netlink_ext_ack *extack)
3194 {
3195         struct rt6_info *rt_cache;
3196         struct fib6_table *table;
3197         struct fib6_info *rt;
3198         struct fib6_node *fn;
3199         int err = -ESRCH;
3200
3201         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3202         if (!table) {
3203                 NL_SET_ERR_MSG(extack, "FIB table does not exist");
3204                 return err;
3205         }
3206
3207         rcu_read_lock();
3208
3209         fn = fib6_locate(&table->tb6_root,
3210                          &cfg->fc_dst, cfg->fc_dst_len,
3211                          &cfg->fc_src, cfg->fc_src_len,
3212                          !(cfg->fc_flags & RTF_CACHE));
3213
3214         if (fn) {
3215                 for_each_fib6_node_rt_rcu(fn) {
3216                         if (cfg->fc_flags & RTF_CACHE) {
3217                                 int rc;
3218
3219                                 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3220                                                               &cfg->fc_src);
3221                                 if (rt_cache) {
3222                                         rc = ip6_del_cached_rt(rt_cache, cfg);
3223                                         if (rc != -ESRCH)
3224                                                 return rc;
3225                                 }
3226                                 continue;
3227                         }
3228                         if (cfg->fc_ifindex &&
3229                             (!rt->fib6_nh.nh_dev ||
3230                              rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
3231                                 continue;
3232                         if (cfg->fc_flags & RTF_GATEWAY &&
3233                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
3234                                 continue;
3235                         if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3236                                 continue;
3237                         if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3238                                 continue;
3239                         fib6_info_hold(rt);
3240                         rcu_read_unlock();
3241
3242                         /* if gateway was specified only delete the one hop */
3243                         if (cfg->fc_flags & RTF_GATEWAY)
3244                                 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3245
3246                         return __ip6_del_rt_siblings(rt, cfg);
3247                 }
3248         }
3249         rcu_read_unlock();
3250
3251         return err;
3252 }
3253
3254 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3255 {
3256         struct netevent_redirect netevent;
3257         struct rt6_info *rt, *nrt = NULL;
3258         struct ndisc_options ndopts;
3259         struct inet6_dev *in6_dev;
3260         struct neighbour *neigh;
3261         struct fib6_info *from;
3262         struct rd_msg *msg;
3263         int optlen, on_link;
3264         u8 *lladdr;
3265
3266         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3267         optlen -= sizeof(*msg);
3268
3269         if (optlen < 0) {
3270                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3271                 return;
3272         }
3273
3274         msg = (struct rd_msg *)icmp6_hdr(skb);
3275
3276         if (ipv6_addr_is_multicast(&msg->dest)) {
3277                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3278                 return;
3279         }
3280
3281         on_link = 0;
3282         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3283                 on_link = 1;
3284         } else if (ipv6_addr_type(&msg->target) !=
3285                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3286                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3287                 return;
3288         }
3289
3290         in6_dev = __in6_dev_get(skb->dev);
3291         if (!in6_dev)
3292                 return;
3293         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3294                 return;
3295
3296         /* RFC2461 8.1:
3297          *      The IP source address of the Redirect MUST be the same as the current
3298          *      first-hop router for the specified ICMP Destination Address.
3299          */
3300
3301         if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3302                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3303                 return;
3304         }
3305
3306         lladdr = NULL;
3307         if (ndopts.nd_opts_tgt_lladdr) {
3308                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3309                                              skb->dev);
3310                 if (!lladdr) {
3311                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3312                         return;
3313                 }
3314         }
3315
3316         rt = (struct rt6_info *) dst;
3317         if (rt->rt6i_flags & RTF_REJECT) {
3318                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3319                 return;
3320         }
3321
3322         /* Redirect received -> path was valid.
3323          * Look, redirects are sent only in response to data packets,
3324          * so that this nexthop apparently is reachable. --ANK
3325          */
3326         dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3327
3328         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3329         if (!neigh)
3330                 return;
3331
3332         /*
3333          *      We have finally decided to accept it.
3334          */
3335
3336         ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3337                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
3338                      NEIGH_UPDATE_F_OVERRIDE|
3339                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3340                                      NEIGH_UPDATE_F_ISROUTER)),
3341                      NDISC_REDIRECT, &ndopts);
3342
3343         rcu_read_lock();
3344         from = rcu_dereference(rt->from);
3345         fib6_info_hold(from);
3346         rcu_read_unlock();
3347
3348         nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3349         if (!nrt)
3350                 goto out;
3351
3352         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3353         if (on_link)
3354                 nrt->rt6i_flags &= ~RTF_GATEWAY;
3355
3356         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3357
3358         /* No need to remove rt from the exception table if rt is
3359          * a cached route because rt6_insert_exception() will
3360          * takes care of it
3361          */
3362         if (rt6_insert_exception(nrt, from)) {
3363                 dst_release_immediate(&nrt->dst);
3364                 goto out;
3365         }
3366
3367         netevent.old = &rt->dst;
3368         netevent.new = &nrt->dst;
3369         netevent.daddr = &msg->dest;
3370         netevent.neigh = neigh;
3371         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3372
3373 out:
3374         fib6_info_release(from);
3375         neigh_release(neigh);
3376 }
3377
3378 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Look up a route previously installed from an RA Route Information
 * option (RFC 4191): an RTF_ROUTEINFO|RTF_GATEWAY entry for @prefix via
 * @gwaddr on @dev.  Returns the entry with a reference held (caller must
 * release it), or NULL if no table or no matching entry exists.
 */
static struct fib6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev)
{
	/* Route-info entries live in the l3mdev table when dev is enslaved,
	 * otherwise in RT6_TABLE_INFO.
	 */
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	int ifindex = dev->ifindex;
	struct fib6_node *fn;
	struct fib6_info *rt = NULL;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
	if (!fn)
		goto out;

	/* NOTE: the iteration macro assigns each leaf entry to 'rt'. */
	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.nh_dev->ifindex != ifindex)
			continue;
		if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
			continue;
		if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
			continue;
		/* Take a reference before leaving the RCU section. */
		fib6_info_hold(rt);
		break;
	}
out:
	rcu_read_unlock();
	return rt;
}
3413
3414 static struct fib6_info *rt6_add_route_info(struct net *net,
3415                                            const struct in6_addr *prefix, int prefixlen,
3416                                            const struct in6_addr *gwaddr,
3417                                            struct net_device *dev,
3418                                            unsigned int pref)
3419 {
3420         struct fib6_config cfg = {
3421                 .fc_metric      = IP6_RT_PRIO_USER,
3422                 .fc_ifindex     = dev->ifindex,
3423                 .fc_dst_len     = prefixlen,
3424                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3425                                   RTF_UP | RTF_PREF(pref),
3426                 .fc_protocol = RTPROT_RA,
3427                 .fc_type = RTN_UNICAST,
3428                 .fc_nlinfo.portid = 0,
3429                 .fc_nlinfo.nlh = NULL,
3430                 .fc_nlinfo.nl_net = net,
3431         };
3432
3433         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3434         cfg.fc_dst = *prefix;
3435         cfg.fc_gateway = *gwaddr;
3436
3437         /* We should treat it as a default route if prefix length is 0. */
3438         if (!prefixlen)
3439                 cfg.fc_flags |= RTF_DEFAULT;
3440
3441         ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3442
3443         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3444 }
3445 #endif
3446
/* Find the RA-learned default-router entry (RTF_ADDRCONF|RTF_DEFAULT)
 * whose gateway is @addr and whose nexthop device is @dev.  Returns the
 * entry with a reference held, or NULL.
 */
struct fib6_info *rt6_get_dflt_router(struct net *net,
				     const struct in6_addr *addr,
				     struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
	struct fib6_info *rt;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	/* The iteration macro drives 'rt'; on normal exhaustion it is
	 * presumably left NULL (hence the bare 'if (rt)' below) — the
	 * break path keeps the matching entry.
	 */
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		if (dev == rt->fib6_nh.nh_dev &&
		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
			break;
	}
	if (rt)
		fib6_info_hold(rt);
	rcu_read_unlock();
	return rt;
}
3471
3472 struct fib6_info *rt6_add_dflt_router(struct net *net,
3473                                      const struct in6_addr *gwaddr,
3474                                      struct net_device *dev,
3475                                      unsigned int pref)
3476 {
3477         struct fib6_config cfg = {
3478                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3479                 .fc_metric      = IP6_RT_PRIO_USER,
3480                 .fc_ifindex     = dev->ifindex,
3481                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3482                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3483                 .fc_protocol = RTPROT_RA,
3484                 .fc_type = RTN_UNICAST,
3485                 .fc_nlinfo.portid = 0,
3486                 .fc_nlinfo.nlh = NULL,
3487                 .fc_nlinfo.nl_net = net,
3488         };
3489
3490         cfg.fc_gateway = *gwaddr;
3491
3492         if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3493                 struct fib6_table *table;
3494
3495                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3496                 if (table)
3497                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3498         }
3499
3500         return rt6_get_dflt_router(net, gwaddr, dev);
3501 }
3502
/* Delete every RA-learned (RTF_DEFAULT|RTF_ADDRCONF) route in @table,
 * unless the route's device still has accept_ra == 2 (accept RA even
 * when forwarding).  Deleting requires dropping the RCU lock, which
 * invalidates the iteration, so restart the walk after each deletion.
 */
static void __rt6_purge_dflt_routers(struct net *net,
				     struct fib6_table *table)
{
	struct fib6_info *rt;

restart:
	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;

		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!idev || idev->cnf.accept_ra != 2)) {
			/* Hold a reference so rt survives the unlock,
			 * then delete it and rescan from the top.
			 */
			fib6_info_hold(rt);
			rcu_read_unlock();
			ip6_del_rt(net, rt);
			goto restart;
		}
	}
	rcu_read_unlock();

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}
3526
3527 void rt6_purge_dflt_routers(struct net *net)
3528 {
3529         struct fib6_table *table;
3530         struct hlist_head *head;
3531         unsigned int h;
3532
3533         rcu_read_lock();
3534
3535         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3536                 head = &net->ipv6.fib_table_hash[h];
3537                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3538                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3539                                 __rt6_purge_dflt_routers(net, table);
3540                 }
3541         }
3542
3543         rcu_read_unlock();
3544 }
3545
3546 static void rtmsg_to_fib6_config(struct net *net,
3547                                  struct in6_rtmsg *rtmsg,
3548                                  struct fib6_config *cfg)
3549 {
3550         memset(cfg, 0, sizeof(*cfg));
3551
3552         cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3553                          : RT6_TABLE_MAIN;
3554         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3555         cfg->fc_metric = rtmsg->rtmsg_metric;
3556         cfg->fc_expires = rtmsg->rtmsg_info;
3557         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3558         cfg->fc_src_len = rtmsg->rtmsg_src_len;
3559         cfg->fc_flags = rtmsg->rtmsg_flags;
3560         cfg->fc_type = rtmsg->rtmsg_type;
3561
3562         cfg->fc_nlinfo.nl_net = net;
3563
3564         cfg->fc_dst = rtmsg->rtmsg_dst;
3565         cfg->fc_src = rtmsg->rtmsg_src;
3566         cfg->fc_gateway = rtmsg->rtmsg_gateway;
3567 }
3568
3569 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3570 {
3571         struct fib6_config cfg;
3572         struct in6_rtmsg rtmsg;
3573         int err;
3574
3575         switch (cmd) {
3576         case SIOCADDRT:         /* Add a route */
3577         case SIOCDELRT:         /* Delete a route */
3578                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3579                         return -EPERM;
3580                 err = copy_from_user(&rtmsg, arg,
3581                                      sizeof(struct in6_rtmsg));
3582                 if (err)
3583                         return -EFAULT;
3584
3585                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3586
3587                 rtnl_lock();
3588                 switch (cmd) {
3589                 case SIOCADDRT:
3590                         err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3591                         break;
3592                 case SIOCDELRT:
3593                         err = ip6_route_del(&cfg, NULL);
3594                         break;
3595                 default:
3596                         err = -EINVAL;
3597                 }
3598                 rtnl_unlock();
3599
3600                 return err;
3601         }
3602
3603         return -EINVAL;
3604 }
3605
3606 /*
3607  *      Drop the packet on the floor
3608  */
3609
/* Drop @skb because no route exists: bump the appropriate SNMP counter,
 * send an ICMPv6 destination-unreachable with @code, and free the skb.
 * @ipstats_mib_noroutes selects IN vs OUT no-route accounting.
 */
static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
	int type;
	struct dst_entry *dst = skb_dst(skb);
	switch (ipstats_mib_noroutes) {
	case IPSTATS_MIB_INNOROUTES:
		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
		if (type == IPV6_ADDR_ANY) {
			/* Unspecified destination: count as an address
			 * error against the receiving device instead.
			 */
			IP6_INC_STATS(dev_net(dst->dev),
				      __in6_dev_get_safely(skb->dev),
				      IPSTATS_MIB_INADDRERRORS);
			break;
		}
		/* FALLTHROUGH */
	case IPSTATS_MIB_OUTNOROUTES:
		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
			      ipstats_mib_noroutes);
		break;
	}
	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
	kfree_skb(skb);
	return 0;
}
3633
/* dst input handler for blackhole/unreachable routes (receive path). */
static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
3638
/* dst output handler for blackhole/unreachable routes (transmit path). */
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}
3644
/* dst input handler for prohibit routes (receive path). */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}
3649
/* dst output handler for prohibit routes (transmit path). */
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}
3655
3656 /*
3657  *      Allocate a dst for local (unicast / anycast) address.
3658  */
3659
3660 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3661                                      struct inet6_dev *idev,
3662                                      const struct in6_addr *addr,
3663                                      bool anycast, gfp_t gfp_flags)
3664 {
3665         u32 tb_id;
3666         struct net_device *dev = idev->dev;
3667         struct fib6_info *f6i;
3668
3669         f6i = fib6_info_alloc(gfp_flags);
3670         if (!f6i)
3671                 return ERR_PTR(-ENOMEM);
3672
3673         f6i->dst_nocount = true;
3674         f6i->dst_host = true;
3675         f6i->fib6_protocol = RTPROT_KERNEL;
3676         f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
3677         if (anycast) {
3678                 f6i->fib6_type = RTN_ANYCAST;
3679                 f6i->fib6_flags |= RTF_ANYCAST;
3680         } else {
3681                 f6i->fib6_type = RTN_LOCAL;
3682                 f6i->fib6_flags |= RTF_LOCAL;
3683         }
3684
3685         f6i->fib6_nh.nh_gw = *addr;
3686         dev_hold(dev);
3687         f6i->fib6_nh.nh_dev = dev;
3688         f6i->fib6_dst.addr = *addr;
3689         f6i->fib6_dst.plen = 128;
3690         tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3691         f6i->fib6_table = fib6_get_table(net, tb_id);
3692
3693         return f6i;
3694 }
3695
3696 /* remove deleted ip from prefsrc entries */
/* remove deleted ip from prefsrc entries */
struct arg_dev_net_ip {
	struct net_device *dev;	/* restrict to this device; NULL = any */
	struct net *net;	/* netns being cleaned */
	struct in6_addr *addr;	/* preferred source address going away */
};
3702
3703 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3704 {
3705         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3706         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3707         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3708
3709         if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3710             rt != net->ipv6.fib6_null_entry &&
3711             ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3712                 spin_lock_bh(&rt6_exception_lock);
3713                 /* remove prefsrc entry */
3714                 rt->fib6_prefsrc.plen = 0;
3715                 /* need to update cache as well */
3716                 rt6_exceptions_remove_prefsrc(rt);
3717                 spin_unlock_bh(&rt6_exception_lock);
3718         }
3719         return 0;
3720 }
3721
/* An address is being removed from @ifp's device: scrub it as the
 * preferred source from every route in the netns.
 */
void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
{
	struct net *net = dev_net(ifp->idev->dev);
	struct arg_dev_net_ip adni = {
		.dev = ifp->idev->dev,
		.net = net,
		.addr = &ifp->addr,
	};
	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
}
3732
3733 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3734
3735 /* Remove routers and update dst entries when gateway turn into host. */
3736 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3737 {
3738         struct in6_addr *gateway = (struct in6_addr *)arg;
3739
3740         if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3741             ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
3742                 return -1;
3743         }
3744
3745         /* Further clean up cached routes in exception table.
3746          * This is needed because cached route may have a different
3747          * gateway than its 'parent' in the case of an ip redirect.
3748          */
3749         rt6_exceptions_clean_tohost(rt, gateway);
3750
3751         return 0;
3752 }
3753
/* @gateway turned from router into host: drop its RA router routes and
 * clean related cached routes across the whole netns.
 */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}
3758
/* Argument for the fib6_ifup/fib6_ifdown walkers: the device concerned
 * plus either nexthop flags to apply (sync up) or the netdev event
 * being handled (sync down) — the two users never need both.
 */
struct arg_netdev_event {
	const struct net_device *dev;
	union {
		unsigned int nh_flags;
		unsigned long event;
	};
};
3766
3767 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3768 {
3769         struct fib6_info *iter;
3770         struct fib6_node *fn;
3771
3772         fn = rcu_dereference_protected(rt->fib6_node,
3773                         lockdep_is_held(&rt->fib6_table->tb6_lock));
3774         iter = rcu_dereference_protected(fn->leaf,
3775                         lockdep_is_held(&rt->fib6_table->tb6_lock));
3776         while (iter) {
3777                 if (iter->fib6_metric == rt->fib6_metric &&
3778                     rt6_qualify_for_ecmp(iter))
3779                         return iter;
3780                 iter = rcu_dereference_protected(iter->rt6_next,
3781                                 lockdep_is_held(&rt->fib6_table->tb6_lock));
3782         }
3783
3784         return NULL;
3785 }
3786
3787 static bool rt6_is_dead(const struct fib6_info *rt)
3788 {
3789         if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3790             (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3791              fib6_ignore_linkdown(rt)))
3792                 return true;
3793
3794         return false;
3795 }
3796
3797 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3798 {
3799         struct fib6_info *iter;
3800         int total = 0;
3801
3802         if (!rt6_is_dead(rt))
3803                 total += rt->fib6_nh.nh_weight;
3804
3805         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3806                 if (!rt6_is_dead(iter))
3807                         total += iter->fib6_nh.nh_weight;
3808         }
3809
3810         return total;
3811 }
3812
/* Assign @rt's nexthop its hash upper bound for multipath selection:
 * the running weight fraction scaled into 31-bit fixed point.  A dead
 * nexthop gets -1 so no hash value ever selects it; @weight accumulates
 * across the sibling walk in rt6_multipath_upper_bound_set().
 */
static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
{
	int upper_bound = -1;

	if (!rt6_is_dead(rt)) {
		*weight += rt->fib6_nh.nh_weight;
		/* cumulative_weight / total scaled to [0, 2^31) minus one */
		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
						    total) - 1;
	}
	/* atomic: readers look this up locklessly on the lookup path */
	atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
}
3824
/* Recompute hash upper bounds for @rt and every sibling in order,
 * threading the cumulative weight through the walk.  @rt must be the
 * first sibling (see rt6_multipath_rebalance()).
 */
static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
{
	struct fib6_info *iter;
	int weight = 0;

	rt6_upper_bound_set(rt, &weight, total);

	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		rt6_upper_bound_set(iter, &weight, total);
}
3835
/* Recompute the multipath hash thresholds of @rt's ECMP group after a
 * nexthop changed state.  No-op for non-multipath routes.
 */
void rt6_multipath_rebalance(struct fib6_info *rt)
{
	struct fib6_info *first;
	int total;

	/* In case the entire multipath route was marked for flushing,
	 * then there is no need to rebalance upon the removal of every
	 * sibling route.
	 */
	if (!rt->fib6_nsiblings || rt->should_flush)
		return;

	/* During lookup routes are evaluated in order, so we need to
	 * make sure upper bounds are assigned from the first sibling
	 * onwards.
	 */
	first = rt6_multipath_first_sibling(rt);
	if (WARN_ON_ONCE(!first))
		return;

	total = rt6_multipath_total_weight(first);
	rt6_multipath_upper_bound_set(first, total);
}
3859
3860 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3861 {
3862         const struct arg_netdev_event *arg = p_arg;
3863         struct net *net = dev_net(arg->dev);
3864
3865         if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
3866                 rt->fib6_nh.nh_flags &= ~arg->nh_flags;
3867                 fib6_update_sernum_upto_root(net, rt);
3868                 rt6_multipath_rebalance(rt);
3869         }
3870
3871         return 0;
3872 }
3873
/* Clear @nh_flags from all routes over @dev (device/admin came up).
 * When clearing DEAD on a device whose carrier is still down, also mark
 * LINKDOWN for clearing so both states resolve consistently.
 */
void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.nh_flags = nh_flags,
		},
	};

	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
		arg.nh_flags |= RTNH_F_LINKDOWN;

	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
}
3888
3889 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3890                                    const struct net_device *dev)
3891 {
3892         struct fib6_info *iter;
3893
3894         if (rt->fib6_nh.nh_dev == dev)
3895                 return true;
3896         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3897                 if (iter->fib6_nh.nh_dev == dev)
3898                         return true;
3899
3900         return false;
3901 }
3902
3903 static void rt6_multipath_flush(struct fib6_info *rt)
3904 {
3905         struct fib6_info *iter;
3906
3907         rt->should_flush = 1;
3908         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3909                 iter->should_flush = 1;
3910 }
3911
3912 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
3913                                              const struct net_device *down_dev)
3914 {
3915         struct fib6_info *iter;
3916         unsigned int dead = 0;
3917
3918         if (rt->fib6_nh.nh_dev == down_dev ||
3919             rt->fib6_nh.nh_flags & RTNH_F_DEAD)
3920                 dead++;
3921         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3922                 if (iter->fib6_nh.nh_dev == down_dev ||
3923                     iter->fib6_nh.nh_flags & RTNH_F_DEAD)
3924                         dead++;
3925
3926         return dead;
3927 }
3928
3929 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
3930                                        const struct net_device *dev,
3931                                        unsigned int nh_flags)
3932 {
3933         struct fib6_info *iter;
3934
3935         if (rt->fib6_nh.nh_dev == dev)
3936                 rt->fib6_nh.nh_flags |= nh_flags;
3937         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3938                 if (iter->fib6_nh.nh_dev == dev)
3939                         iter->fib6_nh.nh_flags |= nh_flags;
3940 }
3941
/* called with write lock held for table with rt */
/* fib6_clean_all() callback for device-down events.  Return value is a
 * verdict for the walker: 0 keeps the route, a negative value requests
 * its deletion.  NOTE(review): -1 vs -2 are distinct deletion codes —
 * confirm their exact handling in fib6_clean_node().
 */
static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
{
	const struct arg_netdev_event *arg = p_arg;
	const struct net_device *dev = arg->dev;
	struct net *net = dev_net(dev);

	/* The null entry is never bound to a real device. */
	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	switch (arg->event) {
	case NETDEV_UNREGISTER:
		/* Device is going away entirely: delete its routes. */
		return rt->fib6_nh.nh_dev == dev ? -1 : 0;
	case NETDEV_DOWN:
		if (rt->should_flush)
			return -1;
		/* Single-path route: delete if it uses this device. */
		if (!rt->fib6_nsiblings)
			return rt->fib6_nh.nh_dev == dev ? -1 : 0;
		if (rt6_multipath_uses_dev(rt, dev)) {
			unsigned int count;

			count = rt6_multipath_dead_count(rt, dev);
			if (rt->fib6_nsiblings + 1 == count) {
				/* Last usable nexthop just died: flush
				 * the whole multipath route.
				 */
				rt6_multipath_flush(rt);
				return -1;
			}
			/* Otherwise only mark the affected nexthops dead
			 * and rebalance the survivors.
			 */
			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
						   RTNH_F_LINKDOWN);
			fib6_update_sernum(net, rt);
			rt6_multipath_rebalance(rt);
		}
		return -2;
	case NETDEV_CHANGE:
		/* Carrier change: flag link-down, but never on local or
		 * anycast routes.
		 */
		if (rt->fib6_nh.nh_dev != dev ||
		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
			break;
		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
		rt6_multipath_rebalance(rt);
		break;
	}

	return 0;
}
3985
/* Propagate a device-down netdev @event to all routes in @dev's netns
 * via the fib6_ifdown walker.
 */
void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.event = event,
		},
	};

	fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
}
3997
/* IPv6 is being disabled on @dev: drop its routes, flush its uncached
 * dst entries and tear down its neighbour cache entries.
 */
void rt6_disable_ip(struct net_device *dev, unsigned long event)
{
	rt6_sync_down_dev(dev, event);
	rt6_uncached_list_flush_dev(dev_net(dev), dev);
	neigh_ifdown(&nd_tbl, dev);
}
4004
/* Argument for the rt6_mtu_change_route() walker. */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* new MTU value */
};
4009
/* fib6_clean_all() callback: propagate a device MTU change into route
 * metrics and cached exception routes.  Always returns 0 (never deletes
 * a route).
 */
static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e. jumbo frame)
	 */
	if (rt->fib6_nh.nh_dev == arg->dev &&
	    !fib6_metric_locked(rt, RTAX_MTU)) {
		u32 mtu = rt->fib6_pmtu;

		/* Shrink always; grow only if the route was tracking the
		 * device MTU (mtu6) rather than a discovered PMTU.
		 */
		if (mtu >= arg->mtu ||
		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
			fib6_metric_set(rt, RTAX_MTU, arg->mtu);

		spin_lock_bh(&rt6_exception_lock);
		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}
4044
/* Device @dev changed its MTU to @mtu: update all affected routes. */
void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
{
	struct rt6_mtu_change_arg arg = {
		.dev = dev,
		.mtu = mtu,
	};

	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
}
4054
/* Netlink attribute validation policy for RTM_NEWROUTE/RTM_DELROUTE/
 * RTM_GETROUTE requests; unlisted attributes are accepted unvalidated.
 */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
	[RTA_OIF]               = { .type = NLA_U32 },
	[RTA_IIF]               = { .type = NLA_U32 },
	[RTA_PRIORITY]          = { .type = NLA_U32 },
	[RTA_METRICS]           = { .type = NLA_NESTED },
	[RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]              = { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
	[RTA_ENCAP]             = { .type = NLA_NESTED },
	[RTA_EXPIRES]           = { .type = NLA_U32 },
	[RTA_UID]               = { .type = NLA_U32 },
	[RTA_MARK]              = { .type = NLA_U32 },
};
4069
/* Parse an RTM_NEWROUTE/RTM_DELROUTE netlink message into @cfg.
 * Returns 0 on success or a negative errno.  NOTE: fc_mx, fc_mp and
 * fc_encap point into the received skb's attribute data — @cfg must not
 * outlive the message.
 */
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg,
			      struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  NULL);
	if (err < 0)
		goto errout;

	/* Default error for the attribute-length checks below. */
	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	memset(cfg, 0, sizeof(*cfg));

	cfg->fc_table = rtm->rtm_table;
	cfg->fc_dst_len = rtm->rtm_dst_len;
	cfg->fc_src_len = rtm->rtm_src_len;
	cfg->fc_flags = RTF_UP;
	cfg->fc_protocol = rtm->rtm_protocol;
	cfg->fc_type = rtm->rtm_type;

	/* Reject-style route types all map onto RTF_REJECT. */
	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);

	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
	cfg->fc_nlinfo.nlh = nlh;
	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}

	if (tb[RTA_DST]) {
		/* Attribute may carry only prefix-length worth of bytes. */
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		/* Points into the skb; parsed later by the add path. */
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	/* RTA_TABLE overrides the header's rtm_table if present. */
	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
						     cfg->fc_mp_len, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_PREF]) {
		/* Unknown preference values fall back to MEDIUM. */
		pref = nla_get_u8(tb[RTA_PREF]);
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

	if (tb[RTA_ENCAP_TYPE]) {
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_EXPIRES]) {
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

		/* Only finite timeouts become an expiring route. */
		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
		}
	}

	err = 0;
errout:
	return err;
}
4195
/* One pending nexthop while building a multipath route. */
struct rt6_nh {
	struct fib6_info *fib6_info;	/* route entry for this nexthop */
	struct fib6_config r_cfg;	/* per-nexthop config snapshot */
	struct list_head next;		/* link in the rt6_nh_list */
};
4201
/* Log every nexthop of a multipath replace that failed midway, since
 * the table may now hold a mix of old and new routes.
 */
static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
{
	struct rt6_nh *nh;

	list_for_each_entry(nh, rt6_nh_list, next) {
		pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
			&nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
			nh->r_cfg.fc_ifindex);
	}
}
4212
/* Queue @rt on @rt6_nh_list for later insertion by the multipath add path.
 *
 * A nexthop that duplicates one already on the list is rejected with
 * -EEXIST.  Metrics from @r_cfg are converted onto @rt before queueing.
 * On success the new list entry takes over the caller's reference to @rt;
 * on failure the caller keeps ownership and must release it.
 *
 * Returns 0 on success or a negative errno.
 */
4213 static int ip6_route_info_append(struct net *net,
4214                                  struct list_head *rt6_nh_list,
4215                                  struct fib6_info *rt,
4216                                  struct fib6_config *r_cfg)
4217 {
4218         struct rt6_nh *nh;
4219         int err = -EEXIST;
4220
4221         list_for_each_entry(nh, rt6_nh_list, next) {
4222                 /* check if fib6_info already exists */
4223                 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4224                         return err;
4225         }
4226
4227         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4228         if (!nh)
4229                 return -ENOMEM;
4230         nh->fib6_info = rt;
4231         err = ip6_convert_metrics(net, rt, r_cfg);
4232         if (err) {
4233                 kfree(nh);
4234                 return err;
4235         }
         /* keep a copy of the per-nexthop config so the rollback path can
          * delete exactly this nexthop via ip6_route_del()
          */
4236         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4237         list_add_tail(&nh->next, rt6_nh_list);
4238
4239         return 0;
4240 }
4241
/* Send the single RTM_NEWROUTE notification for a multipath add/replace.
 *
 * @rt:      first route successfully inserted (may be NULL if none were)
 * @rt_last: last route handed to the insert loop; used to locate the
 *           first sibling when appending to an existing route
 * @nlflags: NLM_F_* flags to report back to userspace
 */
4242 static void ip6_route_mpath_notify(struct fib6_info *rt,
4243                                    struct fib6_info *rt_last,
4244                                    struct nl_info *info,
4245                                    __u16 nlflags)
4246 {
4247         /* if this is an APPEND route, then rt points to the first route
4248          * inserted and rt_last points to last route inserted. Userspace
4249          * wants a consistent dump of the route which starts at the first
4250          * nexthop. Since sibling routes are always added at the end of
4251          * the list, find the first sibling of the last route appended
4252          */
4253         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4254                 rt = list_first_entry(&rt_last->fib6_siblings,
4255                                       struct fib6_info,
4256                                       fib6_siblings);
4257         }
4258
4259         if (rt)
4260                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4261 }
4262
4263 static int ip6_route_multipath_add(struct fib6_config *cfg,
4264                                    struct netlink_ext_ack *extack)
4265 {
4266         struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4267         struct nl_info *info = &cfg->fc_nlinfo;
4268         struct fib6_config r_cfg;
4269         struct rtnexthop *rtnh;
4270         struct fib6_info *rt;
4271         struct rt6_nh *err_nh;
4272         struct rt6_nh *nh, *nh_safe;
4273         __u16 nlflags;
4274         int remaining;
4275         int attrlen;
4276         int err = 1;
4277         int nhn = 0;
4278         int replace = (cfg->fc_nlinfo.nlh &&
4279                        (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4280         LIST_HEAD(rt6_nh_list);
4281
4282         nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4283         if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4284                 nlflags |= NLM_F_APPEND;
4285
4286         remaining = cfg->fc_mp_len;
4287         rtnh = (struct rtnexthop *)cfg->fc_mp;
4288
4289         /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4290          * fib6_info structs per nexthop
4291          */
4292         while (rtnh_ok(rtnh, remaining)) {
4293                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4294                 if (rtnh->rtnh_ifindex)
4295                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4296
4297                 attrlen = rtnh_attrlen(rtnh);
4298                 if (attrlen > 0) {
4299                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4300
4301                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4302                         if (nla) {
4303                                 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4304                                 r_cfg.fc_flags |= RTF_GATEWAY;
4305                         }
4306                         r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4307                         nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4308                         if (nla)
4309                                 r_cfg.fc_encap_type = nla_get_u16(nla);
4310                 }
4311
4312                 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4313                 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4314                 if (IS_ERR(rt)) {
4315                         err = PTR_ERR(rt);
4316                         rt = NULL;
4317                         goto cleanup;
4318                 }
4319
4320                 rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
4321
4322                 err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4323                                             rt, &r_cfg);
4324                 if (err) {
4325                         fib6_info_release(rt);
4326                         goto cleanup;
4327                 }
4328
4329                 rtnh = rtnh_next(rtnh, &remaining);
4330         }
4331
4332         /* for add and replace send one notification with all nexthops.
4333          * Skip the notification in fib6_add_rt2node and send one with
4334          * the full route when done
4335          */
4336         info->skip_notify = 1;
4337
4338         err_nh = NULL;
4339         list_for_each_entry(nh, &rt6_nh_list, next) {
4340                 rt_last = nh->fib6_info;
4341                 err = __ip6_ins_rt(nh->fib6_info, info, extack);
4342                 fib6_info_release(nh->fib6_info);
4343
4344                 /* save reference to first route for notification */
4345                 if (!rt_notif && !err)
4346                         rt_notif = nh->fib6_info;
4347
4348                 /* nh->fib6_info is used or freed at this point, reset to NULL*/
4349                 nh->fib6_info = NULL;
4350                 if (err) {
4351                         if (replace && nhn)
4352                                 ip6_print_replace_route_err(&rt6_nh_list);
4353                         err_nh = nh;
4354                         goto add_errout;
4355                 }
4356
4357                 /* Because each route is added like a single route we remove
4358                  * these flags after the first nexthop: if there is a collision,
4359                  * we have already failed to add the first nexthop:
4360                  * fib6_add_rt2node() has rejected it; when replacing, old
4361                  * nexthops have been replaced by first new, the rest should
4362                  * be added to it.
4363                  */
4364                 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4365                                                      NLM_F_REPLACE);
4366                 nhn++;
4367         }
4368
4369         /* success ... tell user about new route */
4370         ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4371         goto cleanup;
4372
4373 add_errout:
4374         /* send notification for routes that were added so that
4375          * the delete notifications sent by ip6_route_del are
4376          * coherent
4377          */
4378         if (rt_notif)
4379                 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4380
4381         /* Delete routes that were already added */
4382         list_for_each_entry(nh, &rt6_nh_list, next) {
4383                 if (err_nh == nh)
4384                         break;
4385                 ip6_route_del(&nh->r_cfg, extack);
4386         }
4387
4388 cleanup:
4389         list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4390                 if (nh->fib6_info)
4391                         fib6_info_release(nh->fib6_info);
4392                 list_del(&nh->next);
4393                 kfree(nh);
4394         }
4395
4396         return err;
4397 }
4398
4399 static int ip6_route_multipath_del(struct fib6_config *cfg,
4400                                    struct netlink_ext_ack *extack)
4401 {
4402         struct fib6_config r_cfg;
4403         struct rtnexthop *rtnh;
4404         int remaining;
4405         int attrlen;
4406         int err = 1, last_err = 0;
4407
4408         remaining = cfg->fc_mp_len;
4409         rtnh = (struct rtnexthop *)cfg->fc_mp;
4410
4411         /* Parse a Multipath Entry */
4412         while (rtnh_ok(rtnh, remaining)) {
4413                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4414                 if (rtnh->rtnh_ifindex)
4415                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4416
4417                 attrlen = rtnh_attrlen(rtnh);
4418                 if (attrlen > 0) {
4419                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4420
4421                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4422                         if (nla) {
4423                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4424                                 r_cfg.fc_flags |= RTF_GATEWAY;
4425                         }
4426                 }
4427                 err = ip6_route_del(&r_cfg, extack);
4428                 if (err)
4429                         last_err = err;
4430
4431                 rtnh = rtnh_next(rtnh, &remaining);
4432         }
4433
4434         return last_err;
4435 }
4436
4437 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4438                               struct netlink_ext_ack *extack)
4439 {
4440         struct fib6_config cfg;
4441         int err;
4442
4443         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4444         if (err < 0)
4445                 return err;
4446
4447         if (cfg.fc_mp)
4448                 return ip6_route_multipath_del(&cfg, extack);
4449         else {
4450                 cfg.fc_delete_all_nh = 1;
4451                 return ip6_route_del(&cfg, extack);
4452         }
4453 }
4454
4455 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4456                               struct netlink_ext_ack *extack)
4457 {
4458         struct fib6_config cfg;
4459         int err;
4460
4461         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4462         if (err < 0)
4463                 return err;
4464
4465         if (cfg.fc_mp)
4466                 return ip6_route_multipath_add(&cfg, extack);
4467         else
4468                 return ip6_route_add(&cfg, GFP_KERNEL, extack);
4469 }
4470
/* Upper bound on the netlink message size rt6_fill_node() needs for @rt.
 *
 * For a multipath route, one rtnexthop + gateway + encap is reserved per
 * sibling; the attributes of @rt itself (the first nexthop) are covered
 * by the fixed part below.
 */
4471 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4472 {
4473         int nexthop_len = 0;
4474
4475         if (rt->fib6_nsiblings) {
4476                 nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
4477                             + NLA_ALIGN(sizeof(struct rtnexthop))
4478                             + nla_total_size(16) /* RTA_GATEWAY */
4479                             + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
4480
4481                 nexthop_len *= rt->fib6_nsiblings;
4482         }
4483
4484         return NLMSG_ALIGN(sizeof(struct rtmsg))
4485                + nla_total_size(16) /* RTA_SRC */
4486                + nla_total_size(16) /* RTA_DST */
4487                + nla_total_size(16) /* RTA_GATEWAY */
4488                + nla_total_size(16) /* RTA_PREFSRC */
4489                + nla_total_size(4) /* RTA_TABLE */
4490                + nla_total_size(4) /* RTA_IIF */
4491                + nla_total_size(4) /* RTA_OIF */
4492                + nla_total_size(4) /* RTA_PRIORITY */
4493                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4494                + nla_total_size(sizeof(struct rta_cacheinfo))
4495                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4496                + nla_total_size(1) /* RTA_PREF */
4497                + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
4498                + nexthop_len;
4499 }
4500
/* Emit the nexthop attributes of @rt (RTA_GATEWAY, RTA_OIF, lwtunnel
 * encap) into @skb and accumulate the RTNH_F_* state bits into *flags.
 *
 * @skip_oif is set by the multipath encoder, where the interface index
 * lives in the surrounding struct rtnexthop instead of RTA_OIF.
 * Returns 0 on success or -EMSGSIZE when the skb runs out of tailroom.
 */
4501 static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
4502                             unsigned int *flags, bool skip_oif)
4503 {
4504         if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4505                 *flags |= RTNH_F_DEAD;
4506
4507         if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
4508                 *flags |= RTNH_F_LINKDOWN;
4509
             /* fib6_ignore_linkdown() walks RCU-protected idev state */
4510                 rcu_read_lock();
4511                 if (fib6_ignore_linkdown(rt))
4512                         *flags |= RTNH_F_DEAD;
4513                 rcu_read_unlock();
4514         }
4515
4516         if (rt->fib6_flags & RTF_GATEWAY) {
4517                 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
4518                         goto nla_put_failure;
4519         }
4520
4521         *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
4522         if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
4523                 *flags |= RTNH_F_OFFLOAD;
4524
4525         /* not needed for multipath encoding b/c it has a rtnexthop struct */
4526         if (!skip_oif && rt->fib6_nh.nh_dev &&
4527             nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
4528                 goto nla_put_failure;
4529
4530         if (rt->fib6_nh.nh_lwtstate &&
4531             lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
4532                 goto nla_put_failure;
4533
4534         return 0;
4535
4536 nla_put_failure:
4537         return -EMSGSIZE;
4538 }
4539
4540 /* add multipath next hop */
/* Reserve a struct rtnexthop in @skb for @rt inside an open RTA_MULTIPATH
 * nest, fill its hop count/ifindex, and append the per-nexthop attributes
 * via rt6_nexthop_info().  Returns 0 or -EMSGSIZE on overflow.
 */
4541 static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
4542 {
4543         const struct net_device *dev = rt->fib6_nh.nh_dev;
4544         struct rtnexthop *rtnh;
4545         unsigned int flags = 0;
4546
4547         rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh))
4548         if (!rtnh)
4549                 goto nla_put_failure;
4550
         /* rtnh_hops is stored as weight - 1 on the wire */
4551         rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
4552         rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
4553
4554         if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4555                 goto nla_put_failure;
4556
4557         rtnh->rtnh_flags = flags;
4558
4559         /* length of rtnetlink header + attributes */
4560         rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4561
4562         return 0;
4563
4564 nla_put_failure:
4565         return -EMSGSIZE;
4566 }
4567
/* Build one RTM_NEWROUTE/RTM_DELROUTE message for @rt into @skb.
 *
 * @dst:  optional dst_entry for a resolved route; when set, metrics and
 *        cacheinfo come from it rather than from the FIB entry
 * @dest: optional resolved destination; forces rtm_dst_len = 128
 * @src:  optional resolved source (CONFIG_IPV6_SUBTREES only)
 * @iif:  input interface for RTM_GETROUTE replies, 0 otherwise
 *
 * Returns 0 on success; on any overflow the partially built message is
 * cancelled and -EMSGSIZE is returned.
 */
4568 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4569                          struct fib6_info *rt, struct dst_entry *dst,
4570                          struct in6_addr *dest, struct in6_addr *src,
4571                          int iif, int type, u32 portid, u32 seq,
4572                          unsigned int flags)
4573 {
4574         struct rtmsg *rtm;
4575         struct nlmsghdr *nlh;
4576         long expires = 0;
4577         u32 *pmetrics;
4578         u32 table;
4579
4580         nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4581         if (!nlh)
4582                 return -EMSGSIZE;
4583
4584         rtm = nlmsg_data(nlh);
4585         rtm->rtm_family = AF_INET6;
4586         rtm->rtm_dst_len = rt->fib6_dst.plen;
4587         rtm->rtm_src_len = rt->fib6_src.plen;
4588         rtm->rtm_tos = 0;
4589         if (rt->fib6_table)
4590                 table = rt->fib6_table->tb6_id;
4591         else
4592                 table = RT6_TABLE_UNSPEC;
4593         rtm->rtm_table = table;
4594         if (nla_put_u32(skb, RTA_TABLE, table))
4595                 goto nla_put_failure;
4596
4597         rtm->rtm_type = rt->fib6_type;
4598         rtm->rtm_flags = 0;
4599         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4600         rtm->rtm_protocol = rt->fib6_protocol;
4601
4602         if (rt->fib6_flags & RTF_CACHE)
4603                 rtm->rtm_flags |= RTM_F_CLONED;
4604
4605         if (dest) {
4606                 if (nla_put_in6_addr(skb, RTA_DST, dest))
4607                         goto nla_put_failure;
4608                 rtm->rtm_dst_len = 128;
4609         } else if (rtm->rtm_dst_len)
4610                 if (nla_put_in6_addr(skb, RTA_DST, &rt->fib6_dst.addr))
4611                         goto nla_put_failure;
4612 #ifdef CONFIG_IPV6_SUBTREES
4613         if (src) {
4614                 if (nla_put_in6_addr(skb, RTA_SRC, src))
4615                         goto nla_put_failure;
4616                 rtm->rtm_src_len = 128;
4617         } else if (rtm->rtm_src_len &&
4618                    nla_put_in6_addr(skb, RTA_SRC, &rt->fib6_src.addr))
4619                 goto nla_put_failure;
4620 #endif
4621         if (iif) {
4622 #ifdef CONFIG_IPV6_MROUTE
             /* multicast replies are delegated entirely to ip6mr */
4623                 if (ipv6_addr_is_multicast(&rt->fib6_dst.addr)) {
4624                         int err = ip6mr_get_route(net, skb, rtm, portid);
4625
4626                         if (err == 0)
4627                                 return 0;
4628                         if (err < 0)
4629                                 goto nla_put_failure;
4630                 } else
4631 #endif
4632                         if (nla_put_u32(skb, RTA_IIF, iif))
4633                                 goto nla_put_failure;
4634         } else if (dest) {
4635                 struct in6_addr saddr_buf;
4636                 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4637                     nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4638                         goto nla_put_failure;
4639         }
4640
4641         if (rt->fib6_prefsrc.plen) {
4642                 struct in6_addr saddr_buf;
4643                 saddr_buf = rt->fib6_prefsrc.addr;
4644                 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4645                         goto nla_put_failure;
4646         }
4647
         /* resolved dst carries live metrics; otherwise use FIB metrics */
4648         pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
4649         if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4650                 goto nla_put_failure;
4651
4652         if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
4653                 goto nla_put_failure;
4654
4655         /* For multipath routes, walk the siblings list and add
4656          * each as a nexthop within RTA_MULTIPATH.
4657          */
4658         if (rt->fib6_nsiblings) {
4659                 struct fib6_info *sibling, *next_sibling;
4660                 struct nlattr *mp;
4661
4662                 mp = nla_nest_start(skb, RTA_MULTIPATH);
4663                 if (!mp)
4664                         goto nla_put_failure;
4665
4666                 if (rt6_add_nexthop(skb, rt) < 0)
4667                         goto nla_put_failure;
4668
4669                 list_for_each_entry_safe(sibling, next_sibling,
4670                                          &rt->fib6_siblings, fib6_siblings) {
4671                         if (rt6_add_nexthop(skb, sibling) < 0)
4672                                 goto nla_put_failure;
4673                 }
4674
4675                 nla_nest_end(skb, mp);
4676         } else {
4677                 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4678                         goto nla_put_failure;
4679         }
4680
4681         if (rt->fib6_flags & RTF_EXPIRES) {
4682                 expires = dst ? dst->expires : rt->expires;
4683                 expires -= jiffies;
4684         }
4685
4686         if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4687                 goto nla_put_failure;
4688
4689         if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->fib6_flags)))
4690                 goto nla_put_failure;
4691
4692
4693         nlmsg_end(skb, nlh);
4694         return 0;
4695
4696 nla_put_failure:
4697         nlmsg_cancel(skb, nlh);
4698         return -EMSGSIZE;
4699 }
4700
/* Per-route callback for the FIB dump walker: emit @rt into the dump skb.
 *
 * Returns 0 for the null entry, 1 when the route is filtered out by an
 * RTM_F_PREFIX request (NOTE(review): positive appears to mean "skipped,
 * continue" to the walker — confirm against the fib6 walk code), or the
 * rt6_fill_node() result (negative on skb overflow).
 */
4701 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4702 {
4703         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4704         struct net *net = arg->net;
4705
4706         if (rt == net->ipv6.fib6_null_entry)
4707                 return 0;
4708
4709         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4710                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4711
4712                 /* user wants prefix routes only */
4713                 if (rtm->rtm_flags & RTM_F_PREFIX &&
4714                     !(rt->fib6_flags & RTF_PREFIX_RT)) {
4715                         /* success since this is not a prefix route */
4716                         return 1;
4717                 }
4718         }
4719
4720         return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4721                              RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4722                              arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
4723 }
4724
/* RTM_GETROUTE: resolve a route for the selectors in the request and
 * unicast the result back to the sender.
 *
 * RTA_IIF triggers an input-path lookup on that device, otherwise an
 * output lookup is done.  With RTM_F_FIB_MATCH the matched FIB entry is
 * reported instead of the resolved dst.  rt->from is read under
 * rcu_read_lock() because the fib6_info it points to can be released by
 * a concurrent FIB update.
 */
4725 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4726                               struct netlink_ext_ack *extack)
4727 {
4728         struct net *net = sock_net(in_skb->sk);
4729         struct nlattr *tb[RTA_MAX+1];
4730         int err, iif = 0, oif = 0;
4731         struct fib6_info *from;
4732         struct dst_entry *dst;
4733         struct rt6_info *rt;
4734         struct sk_buff *skb;
4735         struct rtmsg *rtm;
4736         struct flowi6 fl6;
4737         bool fibmatch;
4738
4739         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4740                           extack);
4741         if (err < 0)
4742                 goto errout;
4743
4744         err = -EINVAL;
4745         memset(&fl6, 0, sizeof(fl6));
4746         rtm = nlmsg_data(nlh);
4747         fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4748         fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4749
4750         if (tb[RTA_SRC]) {
4751                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4752                         goto errout;
4753
4754                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4755         }
4756
4757         if (tb[RTA_DST]) {
4758                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4759                         goto errout;
4760
4761                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4762         }
4763
4764         if (tb[RTA_IIF])
4765                 iif = nla_get_u32(tb[RTA_IIF]);
4766
4767         if (tb[RTA_OIF])
4768                 oif = nla_get_u32(tb[RTA_OIF]);
4769
4770         if (tb[RTA_MARK])
4771                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4772
4773         if (tb[RTA_UID])
4774                 fl6.flowi6_uid = make_kuid(current_user_ns(),
4775                                            nla_get_u32(tb[RTA_UID]));
4776         else
4777                 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4778
4779         if (iif) {
4780                 struct net_device *dev;
4781                 int flags = 0;
4782
             /* RCU protects the device lookup for the input-path lookup */
4783                 rcu_read_lock();
4784
4785                 dev = dev_get_by_index_rcu(net, iif);
4786                 if (!dev) {
4787                         rcu_read_unlock();
4788                         err = -ENODEV;
4789                         goto errout;
4790                 }
4791
4792                 fl6.flowi6_iif = iif;
4793
4794                 if (!ipv6_addr_any(&fl6.saddr))
4795                         flags |= RT6_LOOKUP_F_HAS_SADDR;
4796
4797                 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
4798
4799                 rcu_read_unlock();
4800         } else {
4801                 fl6.flowi6_oif = oif;
4802
4803                 dst = ip6_route_output(net, NULL, &fl6);
4804         }
4805
4806
4807         rt = container_of(dst, struct rt6_info, dst);
4808         if (rt->dst.error) {
4809                 err = rt->dst.error;
4810                 ip6_rt_put(rt);
4811                 goto errout;
4812         }
4813
4814         if (rt == net->ipv6.ip6_null_entry) {
4815                 err = rt->dst.error;
4816                 ip6_rt_put(rt);
4817                 goto errout;
4818         }
4819
4820         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4821         if (!skb) {
4822                 ip6_rt_put(rt);
4823                 err = -ENOBUFS;
4824                 goto errout;
4825         }
4826
         /* skb now owns the dst reference; freed with the skb */
4827         skb_dst_set(skb, &rt->dst);
4828
4829         rcu_read_lock();
4830         from = rcu_dereference(rt->from);
         /* NOTE(review): rt6_fill_node() dereferences 'from' without a
          * NULL check — confirm rt->from cannot be NULL on these paths.
          */
4831
4832         if (fibmatch)
4833                 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
4834                                     RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4835                                     nlh->nlmsg_seq, 0);
4836         else
4837                 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
4838                                     &fl6.saddr, iif, RTM_NEWROUTE,
4839                                     NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
4840                                     0);
4841         rcu_read_unlock();
4842
4843         if (err < 0) {
4844                 kfree_skb(skb);
4845                 goto errout;
4846         }
4847
4848         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4849 errout:
4850         return err;
4851 }
4852
/* Broadcast a route event (@event, e.g. RTM_NEWROUTE/RTM_DELROUTE) for
 * @rt to RTNLGRP_IPV6_ROUTE listeners.  Allocation uses gfp_any() so the
 * function is usable from both process and softirq context; on failure
 * the error is recorded on the rtnetlink socket instead.
 */
4853 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
4854                      unsigned int nlm_flags)
4855 {
4856         struct sk_buff *skb;
4857         struct net *net = info->nl_net;
4858         u32 seq;
4859         int err;
4860
4861         err = -ENOBUFS;
4862         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4863
4864         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4865         if (!skb)
4866                 goto errout;
4867
4868         err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
4869                             event, info->portid, seq, nlm_flags);
4870         if (err < 0) {
4871                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4872                 WARN_ON(err == -EMSGSIZE);
4873                 kfree_skb(skb);
4874                 goto errout;
4875         }
4876         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4877                     info->nlh, gfp_any());
4878         return;
4879 errout:
4880         if (err < 0)
4881                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4882 }
4883
/* Netdevice notifier: bind the per-namespace special routes (null and,
 * with multiple tables, prohibit/blackhole) to the loopback device on
 * NETDEV_REGISTER, and drop their idev references on NETDEV_UNREGISTER.
 * Non-loopback devices are ignored.
 */
4884 static int ip6_route_dev_notify(struct notifier_block *this,
4885                                 unsigned long event, void *ptr)
4886 {
4887         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4888         struct net *net = dev_net(dev);
4889
4890         if (!(dev->flags & IFF_LOOPBACK))
4891                 return NOTIFY_OK;
4892
4893         if (event == NETDEV_REGISTER) {
4894                 net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
4895                 net->ipv6.ip6_null_entry->dst.dev = dev;
4896                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
4897 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4898                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
4899                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
4900                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
4901                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
4902 #endif
4903          } else if (event == NETDEV_UNREGISTER &&
4904                     dev->reg_state != NETREG_UNREGISTERED) {
4905                 /* NETDEV_UNREGISTER could be fired for multiple times by
4906                  * netdev_wait_allrefs(). Make sure we only call this once.
4907                  */
4908                 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
4909 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4910                 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
4911                 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
4912 #endif
4913         }
4914
4915         return NOTIFY_OK;
4916 }
4917
4918 /*
4919  *      /proc
4920  */
4921
4922 #ifdef CONFIG_PROC_FS
4923
/* file_operations for the per-namespace ipv6_route /proc entry.
 * ipv6_route_open (defined earlier in this file) sets up the seq_file
 * walk over the FIB; registration happens in net-init code below.
 */
4924 static const struct file_operations ipv6_route_proc_fops = {
4925         .open           = ipv6_route_open,
4926         .read           = seq_read,
4927         .llseek         = seq_lseek,
4928         .release        = seq_release_net,
4929 };
4930
/* Emit the per-namespace rt6 statistics line: fib node counts, allocated
 * route structs, route entries, cache entries, live dst entries and
 * discarded routes, all as %04x fields.
 */
4931 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
4932 {
4933         struct net *net = (struct net *)seq->private;
4934         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
4935                    net->ipv6.rt6_stats->fib_nodes,
4936                    net->ipv6.rt6_stats->fib_route_nodes,
4937                    atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
4938                    net->ipv6.rt6_stats->fib_rt_entries,
4939                    net->ipv6.rt6_stats->fib_rt_cache,
4940                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
4941                    net->ipv6.rt6_stats->fib_discarded_routes);
4942
4943         return 0;
4944 }
4945
/* open() hook for the rt6_stats /proc entry: single-shot, net-aware. */
4946 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
4947 {
4948         return single_open_net(inode, file, rt6_stats_seq_show);
4949 }
4950
/* file_operations for the rt6_stats /proc entry. */
4951 static const struct file_operations rt6_stats_seq_fops = {
4952         .open    = rt6_stats_seq_open,
4953         .read    = seq_read,
4954         .llseek  = seq_lseek,
4955         .release = single_release_net,
4956 };
4957 #endif  /* CONFIG_PROC_FS */
4958
4959 #ifdef CONFIG_SYSCTL
4960
4961 static
4962 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4963                               void __user *buffer, size_t *lenp, loff_t *ppos)
4964 {
4965         struct net *net;
4966         int delay;
4967         if (!write)
4968                 return -EINVAL;
4969
4970         net = (struct net *)ctl->extra1;
4971         delay = net->ipv6.sysctl.flush_delay;
4972         proc_dointvec(ctl, write, buffer, lenp, ppos);
4973         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
4974         return 0;
4975 }
4976
/* Template for the per-namespace net.ipv6.route.* sysctl table.  The
 * .data pointers referencing init_net are rewritten per namespace in
 * ipv6_route_sysctl_init(); entry ORDER here must stay in sync with the
 * table[N] indices used there.
 */
4977 struct ctl_table ipv6_route_table_template[] = {
4978         {
4979                 .procname       =       "flush",
4980                 .data           =       &init_net.ipv6.sysctl.flush_delay,
4981                 .maxlen         =       sizeof(int),
4982                 .mode           =       0200,
4983                 .proc_handler   =       ipv6_sysctl_rtcache_flush
4984         },
4985         {
4986                 .procname       =       "gc_thresh",
4987                 .data           =       &ip6_dst_ops_template.gc_thresh,
4988                 .maxlen         =       sizeof(int),
4989                 .mode           =       0644,
4990                 .proc_handler   =       proc_dointvec,
4991         },
4992         {
4993                 .procname       =       "max_size",
4994                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
4995                 .maxlen         =       sizeof(int),
4996                 .mode           =       0644,
4997                 .proc_handler   =       proc_dointvec,
4998         },
4999         {
5000                 .procname       =       "gc_min_interval",
5001                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5002                 .maxlen         =       sizeof(int),
5003                 .mode           =       0644,
5004                 .proc_handler   =       proc_dointvec_jiffies,
5005         },
5006         {
5007                 .procname       =       "gc_timeout",
5008                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
5009                 .maxlen         =       sizeof(int),
5010                 .mode           =       0644,
5011                 .proc_handler   =       proc_dointvec_jiffies,
5012         },
5013         {
5014                 .procname       =       "gc_interval",
5015                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
5016                 .maxlen         =       sizeof(int),
5017                 .mode           =       0644,
5018                 .proc_handler   =       proc_dointvec_jiffies,
5019         },
5020         {
5021                 .procname       =       "gc_elasticity",
5022                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
5023                 .maxlen         =       sizeof(int),
5024                 .mode           =       0644,
5025                 .proc_handler   =       proc_dointvec,
5026         },
5027         {
5028                 .procname       =       "mtu_expires",
5029                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
5030                 .maxlen         =       sizeof(int),
5031                 .mode           =       0644,
5032                 .proc_handler   =       proc_dointvec_jiffies,
5033         },
5034         {
5035                 .procname       =       "min_adv_mss",
5036                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
5037                 .maxlen         =       sizeof(int),
5038                 .mode           =       0644,
5039                 .proc_handler   =       proc_dointvec,
5040         },
5041         {
5042                 .procname       =       "gc_min_interval_ms",
5043                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5044                 .maxlen         =       sizeof(int),
5045                 .mode           =       0644,
5046                 .proc_handler   =       proc_dointvec_ms_jiffies,
5047         },
5048         { }
5049 };
5050
5051 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
5052 {
5053         struct ctl_table *table;
5054
5055         table = kmemdup(ipv6_route_table_template,
5056                         sizeof(ipv6_route_table_template),
5057                         GFP_KERNEL);
5058
5059         if (table) {
5060                 table[0].data = &net->ipv6.sysctl.flush_delay;
5061                 table[0].extra1 = net;
5062                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
5063                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
5064                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5065                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
5066                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
5067                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
5068                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5069                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5070                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5071
5072                 /* Don't export sysctls to unprivileged users */
5073                 if (net->user_ns != &init_user_ns)
5074                         table[0].procname = NULL;
5075         }
5076
5077         return table;
5078 }
5079 #endif
5080
/* Per-netns routing setup: clone the dst_ops template, allocate the
 * special route entries (fib6_null/null, plus prohibit and blackhole
 * when policy routing is configured) and seed the GC/MTU sysctl
 * defaults.  Returns 0 on success or -ENOMEM; on failure the goto
 * chain below unwinds exactly the allocations made so far, in
 * reverse order.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	/* Templates carry no per-netns state; hook up this netns' dst_ops
	 * and read-only metrics after the copy.
	 */
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	/* Defaults for the route-GC sysctls; overridable per netns via
	 * the table built in ipv6_route_sysctl_init().
	 */
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
5155
/* Per-netns teardown counterpart of ip6_route_net_init(): free the
 * special route entries and release the dst accounting.
 */
static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
5166
5167 static int __net_init ip6_route_net_init_late(struct net *net)
5168 {
5169 #ifdef CONFIG_PROC_FS
5170         proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
5171         proc_create("rt6_stats", 0444, net->proc_net, &rt6_stats_seq_fops);
5172 #endif
5173         return 0;
5174 }
5175
/* Late per-netns teardown: remove the /proc/net files created by
 * ip6_route_net_init_late().
 */
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
5183
/* Core per-netns route state (dst_ops, special entries, sysctl defaults). */
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
5188
5189 static int __net_init ipv6_inetpeer_init(struct net *net)
5190 {
5191         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5192
5193         if (!bp)
5194                 return -ENOMEM;
5195         inet_peer_base_init(bp);
5196         net->ipv6.peers = bp;
5197         return 0;
5198 }
5199
5200 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5201 {
5202         struct inet_peer_base *bp = net->ipv6.peers;
5203
5204         net->ipv6.peers = NULL;
5205         inetpeer_invalidate_tree(bp);
5206         kfree(bp);
5207 }
5208
/* Per-netns inetpeer base lifetime. */
static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};
5213
/* Registered last: per-netns /proc reporting files. */
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};
5218
/* Netdevice event hook; runs after addrconf's notifier (lower priority
 * value) so address state is settled first.
 */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
5223
/* Wire init_net's special route entries to the loopback device.
 * Called once at boot, after loopback registration.
 */
void __init ip6_route_init_special_entries(void)
{
	/* Registering of the loopback is done before this portion of code,
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for init_net */
	init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #endif
}
5239
/* Boot-time initialization of the IPv6 routing subsystem: dst slab
 * cache, pernet subsystems, FIB core, xfrm and policy rules, rtnetlink
 * route handlers, netdevice notifier, and per-cpu uncached-route lists.
 * Returns 0 or a negative errno; the goto chain below unwinds every
 * registration made so far, in reverse order.
 */
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	/* Blackhole dsts come from the same slab as regular ones. */
	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	/* All three handlers share one unwind label: rtnl_unregister_all()
	 * below removes whichever of them did register.
	 */
	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
5331
/* Module-unload counterpart of ip6_route_init(); tears down in the
 * exact reverse of the registration order above.
 */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}