]> asedeno.scripts.mit.edu Git - linux.git/blob - net/ipv6/route.c
354a5b8d016ffa37f7969aa13d6b3a02d3e31691
[linux.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <trace/events/fib6.h>
67
68 #include <linux/uaccess.h>
69
70 #ifdef CONFIG_SYSCTL
71 #include <linux/sysctl.h>
72 #endif
73
/* Neighbour reachability verdicts used when scoring candidate routes.
 * Negative values are failures of increasing severity; see
 * rt6_score_route() and rt6_check_neigh() for how they are produced.
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,		/* route must not be used */
	RT6_NUD_FAIL_PROBE = -2,	/* neighbour in NUD_FAILED state */
	RT6_NUD_FAIL_DO_RR = -1,	/* fall back to round-robin selection */
	RT6_NUD_SUCCEED = 1		/* neighbour (probably) reachable */
};
80
81 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
82 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
83 static unsigned int      ip6_mtu(const struct dst_entry *dst);
84 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
85 static void             ip6_dst_destroy(struct dst_entry *);
86 static void             ip6_dst_ifdown(struct dst_entry *,
87                                        struct net_device *dev, int how);
88 static int               ip6_dst_gc(struct dst_ops *ops);
89
90 static int              ip6_pkt_discard(struct sk_buff *skb);
91 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
92 static int              ip6_pkt_prohibit(struct sk_buff *skb);
93 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
94 static void             ip6_link_failure(struct sk_buff *skb);
95 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
96                                            struct sk_buff *skb, u32 mtu);
97 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
98                                         struct sk_buff *skb);
99 static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
100 static size_t rt6_nlmsg_size(struct fib6_info *rt);
101 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
102                          struct fib6_info *rt, struct dst_entry *dst,
103                          struct in6_addr *dest, struct in6_addr *src,
104                          int iif, int type, u32 portid, u32 seq,
105                          unsigned int flags);
106 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
107                                            struct in6_addr *daddr,
108                                            struct in6_addr *saddr);
109
110 #ifdef CONFIG_IPV6_ROUTE_INFO
111 static struct fib6_info *rt6_add_route_info(struct net *net,
112                                            const struct in6_addr *prefix, int prefixlen,
113                                            const struct in6_addr *gwaddr,
114                                            struct net_device *dev,
115                                            unsigned int pref);
116 static struct fib6_info *rt6_get_route_info(struct net *net,
117                                            const struct in6_addr *prefix, int prefixlen,
118                                            const struct in6_addr *gwaddr,
119                                            struct net_device *dev);
120 #endif
121
/* Per-CPU list of rt6_info entries that live outside the FIB tree
 * (added via rt6_uncached_list_add()).  The spinlock protects the list
 * and is taken with BHs disabled.
 */
struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
128
/* Add @rt to the current CPU's uncached list and remember which list it
 * went on, so rt6_uncached_list_del() can remove it from any CPU later.
 */
void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	/* _bh: keep softirq context from taking the lock under us */
	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}
139
/* Remove @rt from the uncached list it was added to, if any, and drop
 * the per-netns uncached-route counter.  Safe to call on an entry that
 * was never list-added: the list_empty() check makes it a no-op then.
 */
void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}
152
/* Re-point every uncached route still referencing @dev at the netns
 * loopback device (presumably during device teardown — the callers are
 * outside this view), keeping all device refcounts balanced.  Loopback
 * itself is never flushed.
 */
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			/* swap the inet6_dev reference over to loopback */
			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			/* swap the netdev reference: hold new, put old */
			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
184
185 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
186                                              struct sk_buff *skb,
187                                              const void *daddr)
188 {
189         if (!ipv6_addr_any(p))
190                 return (const void *) p;
191         else if (skb)
192                 return &ipv6_hdr(skb)->daddr;
193         return daddr;
194 }
195
196 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
197                                    struct net_device *dev,
198                                    struct sk_buff *skb,
199                                    const void *daddr)
200 {
201         struct neighbour *n;
202
203         daddr = choose_neigh_daddr(gw, skb, daddr);
204         n = __ipv6_neigh_lookup(dev, daddr);
205         if (n)
206                 return n;
207         return neigh_create(&nd_tbl, daddr, dev);
208 }
209
210 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
211                                               struct sk_buff *skb,
212                                               const void *daddr)
213 {
214         const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
215
216         return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
217 }
218
219 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
220 {
221         struct net_device *dev = dst->dev;
222         struct rt6_info *rt = (struct rt6_info *)dst;
223
224         daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
225         if (!daddr)
226                 return;
227         if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
228                 return;
229         if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
230                 return;
231         __ipv6_confirm_neigh(dev, daddr);
232 }
233
/* dst_ops callbacks for regular IPv6 dst entries. */
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	dst_cow_metrics_generic,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};
252
253 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
254 {
255         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
256
257         return mtu ? : dst->dev->mtu;
258 }
259
/* PMTU updates are meaningless on a blackhole dst; intentionally a no-op. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}
264
/* Redirects are ignored on a blackhole dst; intentionally a no-op. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}
269
/* dst_ops for blackhole dst entries: PMTU/redirect events are dropped
 * (no-op handlers above), no gc/ifdown hooks.
 */
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
};
281
/* Metrics template with only the hop-limit slot explicitly zeroed
 * (everything else zero-initialized implicitly).
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};
285
/* Sentinel fib6_info returned when no real route matches: a REJECT /
 * RTN_UNREACHABLE entry with the worst possible metric (~0) and a
 * permanent reference.
 */
static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol  = RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};
294
/* Sentinel rt6_info: packets hitting it are discarded with -ENETUNREACH
 * via ip6_pkt_discard{,_out}().
 */
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
306
307 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
308
/* Policy-routing sentinel: packets are rejected with -EACCES via
 * ip6_pkt_prohibit{,_out}().
 */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
320
/* Policy-routing sentinel: packets are silently dropped (dst_discard),
 * error code -EINVAL.
 */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
332
333 #endif
334
/* Zero everything in the rt6_info that follows the embedded dst_entry
 * (dst itself was just set up by dst_alloc() in ip6_dst_alloc()), then
 * make the uncached list node self-linked so rt6_uncached_list_del()'s
 * list_empty() test works.
 */
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	/* dst + 1 == first byte after the dst member */
	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}
342
343 /* allocate dst with ip6_dst_ops */
344 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
345                                int flags)
346 {
347         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
348                                         1, DST_OBSOLETE_FORCE_CHK, flags);
349
350         if (rt) {
351                 rt6_info_init(rt);
352                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
353         }
354
355         return rt;
356 }
357 EXPORT_SYMBOL(ip6_dst_alloc);
358
/* dst_ops->destroy: release everything an rt6_info holds before it is
 * freed — metrics, uncached-list membership, the inet6_dev reference,
 * and the fib6_info it was cloned from.
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	/* clear rt->from under RCU, then drop the ref it was pinning */
	rcu_read_lock();
	from = rcu_dereference(rt->from);
	rcu_assign_pointer(rt->from, NULL);
	fib6_info_release(from);
	rcu_read_unlock();
}
380
/* dst_ops->ifdown: @dev is going down/away, so migrate this route's
 * inet6_dev reference over to the loopback device of @dev's netns
 * (unless it already points there).
 */
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}
397
398 static bool __rt6_check_expired(const struct rt6_info *rt)
399 {
400         if (rt->rt6i_flags & RTF_EXPIRES)
401                 return time_after(jiffies, rt->dst.expires);
402         else
403                 return false;
404 }
405
/* Like __rt6_check_expired(), but when @rt itself has no expiry also
 * consult the fib6_info it was cloned from: an obsolete dst or an
 * expired "from" entry counts as expired.
 * Caller must hold rcu_read_lock() (rcu_dereference of rt->from).
 */
static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}
421
/* Pick one route out of @match's multipath sibling group, keyed by the
 * flow hash so a given flow sticks to the same nexthop.  Each sibling
 * owns a slice of the hash space bounded by its nh_upper_bound; the
 * first sibling covering fl6->mp_hash wins unless its score is
 * negative, in which case the original @match is kept.
 */
static struct fib6_info *rt6_multipath_select(const struct net *net,
					      struct fib6_info *match,
					     struct flowi6 *fl6, int oif,
					     const struct sk_buff *skb,
					     int strict)
{
	struct fib6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}
454
455 /*
456  *      Route lookup. rcu_read_lock() should be held.
457  */
458
/* Walk @rt and the rest of its rt6_next chain, returning the first
 * non-dead entry that satisfies the device constraint: outgoing
 * interface @oif if given, otherwise a device owning source address
 * @saddr.  Falls back to @rt itself (if alive), or fib6_null_entry when
 * a strict interface match (RT6_LOOKUP_F_IFACE) was required.
 */
static inline struct fib6_info *rt6_device_match(struct net *net,
						 struct fib6_info *rt,
						    const struct in6_addr *saddr,
						    int oif,
						    int flags)
{
	struct fib6_info *sprt;

	/* no constraints at all: take @rt as long as it is not dead */
	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) {
		const struct net_device *dev = sprt->fib6_nh.nh_dev;

		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE)
		return net->ipv6.fib6_null_entry;

	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
}
492
493 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred neighbour-solicitation probe; queued by rt6_probe() and run
 * by rt6_probe_deferred().  Holds a reference on @dev until the work
 * item runs.
 */
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};
499
/* Workqueue handler for rt6_probe(): send a neighbour solicitation to
 * the target's solicited-node multicast address, then drop the device
 * reference taken by rt6_probe() and free the work item.
 */
static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}
511
/* Schedule a reachability probe of @rt's gateway, deferred to a
 * workqueue item (rt6_probe_deferred).  Non-gateway routes are ignored;
 * probes are skipped while the neighbour is NUD_VALID and rate-limited
 * via the per-device rtr_probe_interval.
 */
static void rt6_probe(struct fib6_info *rt)
{
	struct __rt6_probe_work *work;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
		return;

	nh_gw = &rt->fib6_nh.nh_gw;
	dev = rt->fib6_nh.nh_dev;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		struct inet6_dev *idev;

		if (neigh->nud_state & NUD_VALID)
			goto out;

		idev = __in6_dev_get(dev);
		work = NULL;
		write_lock(&neigh->lock);
		/* re-check under the lock; only probe if the rate-limit
		 * interval has elapsed since the last neighbour update
		 */
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		/* no neighbour entry yet: always probe */
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);	/* released by rt6_probe_deferred() */
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
566 #else
/* Router reachability probing disabled without CONFIG_IPV6_ROUTER_PREF. */
static inline void rt6_probe(struct fib6_info *rt)
{
}
570 #endif
571
572 /*
573  * Default Router Selection (RFC 2461 6.3.6)
574  */
575 static inline int rt6_check_dev(struct fib6_info *rt, int oif)
576 {
577         const struct net_device *dev = rt->fib6_nh.nh_dev;
578
579         if (!oif || dev->ifindex == oif)
580                 return 2;
581         return 0;
582 }
583
/* Judge reachability of @rt's gateway neighbour.  Non-gateway (or
 * NONEXTHOP) routes always succeed.  With CONFIG_IPV6_ROUTER_PREF any
 * neighbour that is not NUD_FAILED counts as reachable and a failed one
 * asks for a probe; without it, a missing neighbour entry requests
 * round-robin fallback (RT6_NUD_FAIL_DO_RR).
 */
static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	if (rt->fib6_flags & RTF_NONEXTHOP ||
	    !(rt->fib6_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
					  &rt->fib6_nh.nh_gw);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
615
/* Score @rt for route selection: interface match from rt6_check_dev(),
 * plus the decoded router preference (shifted past the device bits)
 * with CONFIG_IPV6_ROUTER_PREF.  Returns a negative rt6_nud_state on
 * failure: immediately for a strict interface mismatch, or from
 * rt6_check_neigh() when RT6_LOOKUP_F_REACHABLE is requested.
 */
static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}
633
634 /* called with rc_read_lock held */
635 static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
636 {
637         const struct net_device *dev = fib6_info_nh_dev(f6i);
638         bool rc = false;
639
640         if (dev) {
641                 const struct inet6_dev *idev = __in6_dev_get(dev);
642
643                 rc = !!idev->cnf.ignore_routes_with_linkdown;
644         }
645
646         return rc;
647 }
648
/* Score @rt and promote it to the new best match if it beats *mpri.
 * Dead routes, link-down routes (unless the device or caller ignores
 * link state), and expired routes are skipped.  *do_rr is set when the
 * winning route scored RT6_NUD_FAIL_DO_RR, asking the caller to rotate
 * the round-robin pointer.  Returns the (possibly unchanged) best
 * match.
 */
static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
				   int *mpri, struct fib6_info *match,
				   bool *do_rr)
{
	int m;
	bool match_do_rr = false;

	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		goto out;

	if (fib6_ignore_linkdown(rt) &&
	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (fib6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}
687
/* Find the best route among those at this node sharing @metric,
 * scanning from @rr_head (the round-robin start) to the end of the
 * chain and then wrapping from @leaf back up to @rr_head.  The first
 * route seen with a different metric is remembered as "cont"; those
 * lower-priority routes are only scored if nothing at @metric matched.
 */
static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				     struct fib6_info *leaf,
				     struct fib6_info *rr_head,
				     u32 metric, int oif, int strict,
				     bool *do_rr)
{
	struct fib6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	/* wrap-around: the part of the chain before rr_head */
	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->rt6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	/* nothing matched at @metric; try the remaining routes */
	for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
726
/* Select the best route at node @fn, starting the scan at the node's
 * round-robin pointer (fn->rr_ptr).  When the winning route requested
 * round-robin, advance rr_ptr to the next route of the same metric
 * under the table lock.  rcu_read_lock must be held.  Returns
 * fib6_null_entry when nothing usable is found.
 */
static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
				   int oif, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not points to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->rt6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.fib6_null_entry;
}
776
777 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
778 {
779         return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
780 }
781
782 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Process a Route Information Option received in a Router Advertisement
 * from @gwaddr on @dev (CONFIG_IPV6_ROUTE_INFO): validate the option,
 * then create, refresh, or delete (lifetime == 0) the corresponding
 * RTF_ROUTEINFO route.  Returns 0 on success or -EINVAL on a malformed
 * option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	/* a zero-length prefix advertises a default router */
	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	/* lifetime 0: the router withdraws the route */
	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		/* drop the reference taken by the get/add helpers */
		fib6_info_release(rt);
	}
	return 0;
}
856 #endif
857
858 /*
859  *      Misc support functions
860  */
861
862 /* called with rcu_lock held */
863 static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
864 {
865         struct net_device *dev = rt->fib6_nh.nh_dev;
866
867         if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
868                 /* for copies of local routes, dst->dev needs to be the
869                  * device if it is a master device, the master device if
870                  * device is enslaved, and the loopback as the default
871                  */
872                 if (netif_is_l3_slave(dev) &&
873                     !rt6_need_strict(&rt->fib6_dst.addr))
874                         dev = l3mdev_master_dev_rcu(dev);
875                 else if (!netif_is_l3_master(dev))
876                         dev = dev_net(dev)->loopback_dev;
877                 /* last case is netif_is_l3_master(dev) is true in which
878                  * case we want dev returned to be dev
879                  */
880         }
881
882         return dev;
883 }
884
/* Per-RTN_* route-type properties: the errno a reject-style route
 * reports via dst.error (0 for types that forward normally).
 * Consumed by ip6_rt_type_to_error().
 */
static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};
899
/* errno associated with the given RTN_* route type (0 when the type
 * forwards normally); simple lookup into fib6_prop[].
 */
static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}
904
905 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
906 {
907         unsigned short flags = 0;
908
909         if (rt->dst_nocount)
910                 flags |= DST_NOCOUNT;
911         if (rt->dst_nopolicy)
912                 flags |= DST_NOPOLICY;
913         if (rt->dst_host)
914                 flags |= DST_HOST;
915
916         return flags;
917 }
918
919 static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
920 {
921         rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);
922
923         switch (ort->fib6_type) {
924         case RTN_BLACKHOLE:
925                 rt->dst.output = dst_discard_out;
926                 rt->dst.input = dst_discard;
927                 break;
928         case RTN_PROHIBIT:
929                 rt->dst.output = ip6_pkt_prohibit_out;
930                 rt->dst.input = ip6_pkt_prohibit;
931                 break;
932         case RTN_THROW:
933         case RTN_UNREACHABLE:
934         default:
935                 rt->dst.output = ip6_pkt_discard_out;
936                 rt->dst.input = ip6_pkt_discard;
937                 break;
938         }
939 }
940
/* Initialize the dst part of @rt (flags, error, input/output handlers,
 * lwtunnel state) from the fib entry @ort it is being cloned from.
 */
static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.flags |= fib6_info_dst_flags(ort);

	/* reject routes get discard/prohibit handlers plus an error code */
	if (ort->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, ort);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	/* pick the input handler: local delivery, multicast, or forward */
	if (ort->fib6_type == RTN_LOCAL) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (ort->fib6_nh.nh_lwtstate) {
		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}
968
/* Bind cached route @rt to its parent fib entry @from: take a
 * reference on @from, publish it via RCU, and share @from's metrics.
 */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	fib6_info_hold(from);
	rcu_assign_pointer(rt->from, from);
	dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
	if (from->fib6_metrics != &dst_default_metrics) {
		/* the metrics block is shared — pin it for the dst's
		 * lifetime so it outlives @from if need be
		 */
		rt->dst._metrics |= DST_METRICS_REFCOUNTED;
		refcount_inc(&from->fib6_metrics->refcnt);
	}
}
980
/* Fill a freshly allocated rt6_info @rt from fib entry @ort: dst
 * handlers, addresses, flags, idev and lwtunnel references.  Takes a
 * reference on @ort (via rt6_set_from()) and on the nexthop's idev.
 */
static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
{
	struct net_device *dev = fib6_info_nh_dev(ort);

	ip6_rt_init_dst(rt, ort);

	rt->rt6i_dst = ort->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
	rt->rt6i_flags = ort->fib6_flags;
	rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->fib6_src;
#endif
	rt->rt6i_prefsrc = ort->fib6_prefsrc;
	rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
}
998
/* Walk back up the fib tree from @fn to the next node that carries
 * route info (RTN_RTINFO), descending into a parent's source-address
 * subtree when one exists.  Returns NULL once the table root is hit
 * without a match.  Runs under RCU (see rcu_dereference below).
 */
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		/* sn == fn means we just came out of that subtree */
		if (sn && sn != fn)
			fn = fib6_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}
1016
1017 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
1018                           bool null_fallback)
1019 {
1020         struct rt6_info *rt = *prt;
1021
1022         if (dst_hold_safe(&rt->dst))
1023                 return true;
1024         if (null_fallback) {
1025                 rt = net->ipv6.ip6_null_entry;
1026                 dst_hold(&rt->dst);
1027         } else {
1028                 rt = NULL;
1029         }
1030         *prt = rt;
1031         return false;
1032 }
1033
1034 /* called with rcu_lock held */
1035 static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
1036 {
1037         unsigned short flags = fib6_info_dst_flags(rt);
1038         struct net_device *dev = rt->fib6_nh.nh_dev;
1039         struct rt6_info *nrt;
1040
1041         nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1042         if (nrt)
1043                 ip6_rt_copy_init(nrt, rt);
1044
1045         return nrt;
1046 }
1047
/* Single-table route lookup for @fl6: find the best fib entry in
 * @table (with ECMP selection and backtracking), prefer a cached
 * exception route when one exists, otherwise clone a dst from the
 * fib entry.  Never returns NULL — ip6_null_entry is returned when
 * nothing matches or cloning fails.
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_info *f6i;
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	f6i = rcu_dereference(fn->leaf);
	if (!f6i) {
		f6i = net->ipv6.fib6_null_entry;
	} else {
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
				      fl6->flowi6_oif, flags);
		/* only spread over ECMP siblings when no oif is pinned */
		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
			f6i = rt6_multipath_select(net, f6i, fl6,
						   fl6->flowi6_oif, skb, flags);
	}
	if (f6i == net->ipv6.fib6_null_entry) {
		/* no usable leaf here — climb toward the root and retry */
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);
	} else if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		/* clone a dst from the fib entry; fall back to the null
		 * entry if the allocation fails
		 */
		rt = ip6_create_rt_rcu(f6i);
		if (!rt) {
			rt = net->ipv6.ip6_null_entry;
			dst_hold(&rt->dst);
		}
	}

	rcu_read_unlock();

	trace_fib6_table_lookup(net, rt, table, fl6);

	return rt;
}
1102
/* Policy-routing aware lookup: resolve @fl6 through the fib-rules
 * framework, using ip6_pol_route_lookup() as the per-table lookup.
 */
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
1109
1110 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1111                             const struct in6_addr *saddr, int oif,
1112                             const struct sk_buff *skb, int strict)
1113 {
1114         struct flowi6 fl6 = {
1115                 .flowi6_oif = oif,
1116                 .daddr = *daddr,
1117         };
1118         struct dst_entry *dst;
1119         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1120
1121         if (saddr) {
1122                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1123                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1124         }
1125
1126         dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1127         if (dst->error == 0)
1128                 return (struct rt6_info *) dst;
1129
1130         dst_release(dst);
1131
1132         return NULL;
1133 }
1134 EXPORT_SYMBOL(rt6_lookup);
1135
1136 /* ip6_ins_rt is called with FREE table->tb6_lock.
1137  * It takes new route entry, the addition fails by any reason the
1138  * route is released.
1139  * Caller must hold dst before calling it.
1140  */
1141
1142 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1143                         struct netlink_ext_ack *extack)
1144 {
1145         int err;
1146         struct fib6_table *table;
1147
1148         table = rt->fib6_table;
1149         spin_lock_bh(&table->tb6_lock);
1150         err = fib6_add(&table->tb6_root, rt, info, extack);
1151         spin_unlock_bh(&table->tb6_lock);
1152
1153         return err;
1154 }
1155
/* Insert @rt into its fib table with default netlink info (no portid,
 * no extack).  Caller must hold a reference on @rt (see __ip6_ins_rt).
 */
int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = {	.nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}
1162
/* Allocate an RTF_CACHE clone of fib entry @ort, pinned to host
 * route @daddr (/128, and /128 @saddr under subtrees).  Returns NULL
 * on allocation failure.  Uses ip6_rt_get_dev_rcu(), so the caller
 * must hold rcu_read_lock().
 */
static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	dev = ip6_rt_get_dev_rcu(ort);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		/* a non-/128 on-link prefix matched exactly at its own
		 * address: mark the clone anycast
		 */
		if (ort->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
1199
1200 static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
1201 {
1202         unsigned short flags = fib6_info_dst_flags(rt);
1203         struct net_device *dev;
1204         struct rt6_info *pcpu_rt;
1205
1206         rcu_read_lock();
1207         dev = ip6_rt_get_dev_rcu(rt);
1208         pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
1209         rcu_read_unlock();
1210         if (!pcpu_rt)
1211                 return NULL;
1212         ip6_rt_copy_init(pcpu_rt, rt);
1213         pcpu_rt->rt6i_flags |= RTF_PCPU;
1214         return pcpu_rt;
1215 }
1216
1217 /* It should be called with rcu_read_lock() acquired */
1218 static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
1219 {
1220         struct rt6_info *pcpu_rt, **p;
1221
1222         p = this_cpu_ptr(rt->rt6i_pcpu);
1223         pcpu_rt = *p;
1224
1225         if (pcpu_rt)
1226                 ip6_hold_safe(NULL, &pcpu_rt, false);
1227
1228         return pcpu_rt;
1229 }
1230
/* Allocate a per-cpu copy of @rt and publish it in this CPU's slot.
 * Falls back to a held ip6_null_entry if allocation fails.  The slot
 * is required to be empty — a concurrent fill of the same slot would
 * trip the BUG_ON below.
 */
static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	/* one reference for the per-cpu slot, returned held to the caller */
	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	return pcpu_rt;
}
1249
/* exception hash table implementation
 *
 * Each fib6_info can carry a hash table of RTF_CACHE exception routes
 * (rt6i_exception_bucket).  All modifications are serialized by
 * rt6_exception_lock; lookups run under RCU.
 */
static DEFINE_SPINLOCK(rt6_exception_lock);
1253
/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	hlist_del_rcu(&rt6_ex->hlist);
	/* drop the table's reference on the cached route */
	dst_release(&rt6_ex->rt6i->dst);
	/* free only after a grace period so RCU walkers stay safe */
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
	net->ipv6.rt6_stats->fib_rt_cache--;
}
1273
/* Remove oldest rt6_ex in bucket and free the memory
 * (oldest == smallest stamp, i.e. least recently refreshed)
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}
1290
/* Hash (@dst[, @src]) into an exception-bucket index of
 * FIB6_EXCEPTION_BUCKET_SIZE_SHIFT bits.  @src only contributes when
 * subtrees are configured.  The jhash seed is drawn lazily, once.
 */
static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
1306
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 * Returns the matching rt6_exception or NULL.
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	/* side effect: advance the caller's pointer to the hashed bucket */
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1339
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 * RCU twin of __rt6_find_exception_spinlock() for the lookup path.
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	/* side effect: advance the caller's pointer to the hashed bucket */
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1374
/* Effective MTU of fib entry @rt: the route's own PMTU metric if set,
 * otherwise the nexthop device's IPv6 MTU; capped at IP6_MAX_MTU and
 * reduced by any lwtunnel encap headroom.
 */
static unsigned int fib6_mtu(const struct fib6_info *rt)
{
	unsigned int mtu;

	if (rt->fib6_pmtu) {
		mtu = rt->fib6_pmtu;
	} else {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		/* NOTE(review): assumes the nexthop device always has an
		 * inet6_dev — __in6_dev_get() can return NULL; confirm.
		 */
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
}
1395
/* Insert cached route @nrt into the exception table of its parent
 * fib entry @ort, replacing any existing entry for the same
 * (daddr, saddr) key and evicting the oldest entry if the bucket
 * exceeds FIB6_MAX_DEPTH.  Returns 0 on success, -EINVAL or -ENOMEM
 * on failure.  On success the table keeps a reference to @nrt
 * (dropped later in rt6_remove_exception()).
 */
static int rt6_insert_exception(struct rt6_info *nrt,
				struct fib6_info *ort)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	/* ort is being flushed/deleted; do not re-create buckets for it */
	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		/* first exception for this route: allocate the bucket array */
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->fib6_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->fib6_table->tb6_lock);
		fib6_update_sernum(net, ort);
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}
1480
/* Remove every cached exception route of @rt and mark the entry so
 * rt6_insert_exception() refuses to create new ones afterwards.
 */
void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() to recreate the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}
1507
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 * Returns the (unreferenced) cached route, or NULL when there is no
 * match or the match has expired.
 */
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->fib6_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}
1539
/* Remove the passed in cached rt from the hash table that contains it.
 * Returns 0 on success, -EINVAL when @rt is not a cached route, or
 * -ENOENT when no matching exception entry exists.
 */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	/* NOTE(review): rt->from is published with rcu_assign_pointer()
	 * in rt6_set_from() but read here without rcu_dereference() or
	 * an RCU read-side section — confirm callers keep 'from' alive.
	 */
	struct fib6_info *from = rt->from;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err;

	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}
1582
1583 /* Find rt6_ex which contains the passed in rt cache and
1584  * refresh its stamp
1585  */
1586 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1587 {
1588         struct rt6_exception_bucket *bucket;
1589         struct fib6_info *from = rt->from;
1590         struct in6_addr *src_key = NULL;
1591         struct rt6_exception *rt6_ex;
1592
1593         if (!from ||
1594             !(rt->rt6i_flags & RTF_CACHE))
1595                 return;
1596
1597         rcu_read_lock();
1598         bucket = rcu_dereference(from->rt6i_exception_bucket);
1599
1600 #ifdef CONFIG_IPV6_SUBTREES
1601         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1602          * and exception table is indexed by a hash of
1603          * both rt6i_dst and rt6i_src.
1604          * Otherwise, the exception table is indexed by
1605          * a hash of only rt6i_dst.
1606          */
1607         if (from->fib6_src.plen)
1608                 src_key = &rt->rt6i_src.addr;
1609 #endif
1610         rt6_ex = __rt6_find_exception_rcu(&bucket,
1611                                           &rt->rt6i_dst.addr,
1612                                           src_key);
1613         if (rt6_ex)
1614                 rt6_ex->stamp = jiffies;
1615
1616         rcu_read_unlock();
1617 }
1618
1619 static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt)
1620 {
1621         struct rt6_exception_bucket *bucket;
1622         struct rt6_exception *rt6_ex;
1623         int i;
1624
1625         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1626                                         lockdep_is_held(&rt6_exception_lock));
1627
1628         if (bucket) {
1629                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1630                         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1631                                 rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
1632                         }
1633                         bucket++;
1634                 }
1635         }
1636 }
1637
1638 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1639                                          struct rt6_info *rt, int mtu)
1640 {
1641         /* If the new MTU is lower than the route PMTU, this new MTU will be the
1642          * lowest MTU in the path: always allow updating the route PMTU to
1643          * reflect PMTU decreases.
1644          *
1645          * If the new MTU is higher, and the route PMTU is equal to the local
1646          * MTU, this means the old MTU is the lowest in the path, so allow
1647          * updating it: if other nodes now have lower MTUs, PMTU discovery will
1648          * handle this.
1649          */
1650
1651         if (dst_mtu(&rt->dst) >= mtu)
1652                 return true;
1653
1654         if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1655                 return true;
1656
1657         return false;
1658 }
1659
/* Propagate a device MTU change to the cached exception routes of
 * @rt, subject to rt6_mtu_change_route_allowed().  Caller must hold
 * rt6_exception_lock (asserted by rcu_dereference_protected).
 */
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}
1688
#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

/* Remove every cached exception route of @rt whose gateway equals
 * @gateway (entries flagged both RTF_GATEWAY and RTF_CACHE).
 */
static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* cheap lockless check before taking the write lock */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}
1725
/* Decide whether a single cached exception should be removed during
 * GC: prune non-RTF_EXPIRES entries unused for gc_args->timeout,
 * RTF_EXPIRES entries past their expiry, and gateway entries whose
 * neighbour lost NTF_ROUTER; otherwise bump gc_args->more so GC
 * keeps running.
 */
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others have still references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}
1769
/* Walk every exception bucket of @rt and garbage-collect stale cached
 * routes through rt6_age_examine_exception().
 */
void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* Lockless fast path: nothing to age if no exception table exists. */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	/* rcu_read_lock_bh() covers the neighbour lookup performed inside
	 * rt6_age_examine_exception(); the spinlock serializes against
	 * other writers of the exception table.
	 */
	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}
1800
/* Policy-routing lookup shared by the input and output paths.
 *
 * Resolves @fl6 in @table honoring the RT6_LOOKUP_F_* bits in @flags and
 * returns a dst-held rt6_info: a cached exception route if one matches,
 * an uncached RTF_CACHE clone for the FLOWI_FLAG_KNOWN_NH special case,
 * or a per-cpu copy of the FIB entry.  Never returns NULL; lookup
 * failure yields the held ip6_null_entry.
 */
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_node *fn, *saved_fn;
	struct fib6_info *f6i;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	/* Hosts (forwarding disabled) prefer routers believed reachable;
	 * this requirement is dropped below if nothing else matches.
	 */
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	f6i = rt6_select(net, fn, oif, strict);
	if (f6i->fib6_nsiblings)
		f6i = rt6_multipath_select(net, f6i, fl6, oif, skb, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
		/* No match at this node: back up the tree; as a last resort
		 * retry from the original node without the reachability
		 * requirement.
		 */
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		trace_fib6_table_lookup(net, rt, table, fl6);
		return rt;
	}

	/*Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		trace_fib6_table_lookup(net, rt, table, fl6);
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(f6i->fib6_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		trace_fib6_table_lookup(net, uncached_rt, table, fl6);
		return uncached_rt;

	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		/* BH disabled: the pcpu route cache is also touched from
		 * softirq context.
		 */
		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(f6i);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, f6i);

		local_bh_enable();
		rcu_read_unlock();
		trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
1901
1902 static struct rt6_info *ip6_pol_route_input(struct net *net,
1903                                             struct fib6_table *table,
1904                                             struct flowi6 *fl6,
1905                                             const struct sk_buff *skb,
1906                                             int flags)
1907 {
1908         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1909 }
1910
1911 struct dst_entry *ip6_route_input_lookup(struct net *net,
1912                                          struct net_device *dev,
1913                                          struct flowi6 *fl6,
1914                                          const struct sk_buff *skb,
1915                                          int flags)
1916 {
1917         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1918                 flags |= RT6_LOOKUP_F_IFACE;
1919
1920         return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1921 }
1922 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1923
1924 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1925                                   struct flow_keys *keys,
1926                                   struct flow_keys *flkeys)
1927 {
1928         const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1929         const struct ipv6hdr *key_iph = outer_iph;
1930         struct flow_keys *_flkeys = flkeys;
1931         const struct ipv6hdr *inner_iph;
1932         const struct icmp6hdr *icmph;
1933         struct ipv6hdr _inner_iph;
1934
1935         if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1936                 goto out;
1937
1938         icmph = icmp6_hdr(skb);
1939         if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1940             icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1941             icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1942             icmph->icmp6_type != ICMPV6_PARAMPROB)
1943                 goto out;
1944
1945         inner_iph = skb_header_pointer(skb,
1946                                        skb_transport_offset(skb) + sizeof(*icmph),
1947                                        sizeof(_inner_iph), &_inner_iph);
1948         if (!inner_iph)
1949                 goto out;
1950
1951         key_iph = inner_iph;
1952         _flkeys = NULL;
1953 out:
1954         if (_flkeys) {
1955                 keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
1956                 keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
1957                 keys->tags.flow_label = _flkeys->tags.flow_label;
1958                 keys->basic.ip_proto = _flkeys->basic.ip_proto;
1959         } else {
1960                 keys->addrs.v6addrs.src = key_iph->saddr;
1961                 keys->addrs.v6addrs.dst = key_iph->daddr;
1962                 keys->tags.flow_label = ip6_flowinfo(key_iph);
1963                 keys->basic.ip_proto = key_iph->nexthdr;
1964         }
1965 }
1966
/* if skb is set it will be used and fl6 can be NULL */
/* Compute the multipath hash for a flow according to the per-netns
 * hash policy: policy 0 hashes on L3 fields only, policy 1 on the
 * L3+L4 five-tuple.  Result is shifted right by one so the value 0 can
 * keep its "no hash" meaning to callers.
 */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		/* L3-only: addresses, flow label and next header. */
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		/* L4 five-tuple: addresses, ports, protocol. */
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			/* Dissect only if the caller did not already. */
			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}
2023
/* Route an incoming skb: build a flowi6 from the IPv6 header plus any
 * tunnel/flow-dissection metadata, look the route up and attach the
 * resulting dst to the skb.
 */
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	/* Collected-on-RX tunnel metadata keys the lookup too. */
	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	/* ICMPv6 errors must hash on the embedded flow so they take the
	 * same multipath leg as the flow they refer to.
	 */
	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}
2053
2054 static struct rt6_info *ip6_pol_route_output(struct net *net,
2055                                              struct fib6_table *table,
2056                                              struct flowi6 *fl6,
2057                                              const struct sk_buff *skb,
2058                                              int flags)
2059 {
2060         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2061 }
2062
2063 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
2064                                          struct flowi6 *fl6, int flags)
2065 {
2066         bool any_src;
2067
2068         if (rt6_need_strict(&fl6->daddr)) {
2069                 struct dst_entry *dst;
2070
2071                 dst = l3mdev_link_scope_lookup(net, fl6);
2072                 if (dst)
2073                         return dst;
2074         }
2075
2076         fl6->flowi6_iif = LOOPBACK_IFINDEX;
2077
2078         any_src = ipv6_addr_any(&fl6->saddr);
2079         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2080             (fl6->flowi6_oif && any_src))
2081                 flags |= RT6_LOOKUP_F_IFACE;
2082
2083         if (!any_src)
2084                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2085         else if (sk)
2086                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2087
2088         return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2089 }
2090 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2091
/* Clone @dst_orig into a blackhole dst (input/output discard everything)
 * that keeps the original's metrics, gateway and keys.  Used e.g. when a
 * route must keep existing but stop forwarding.  Consumes the caller's
 * reference on @dst_orig; returns ERR_PTR(-ENOMEM) on allocation failure.
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new = &rt->dst;
		new->__use = 1;
		/* Both directions silently drop. */
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		/* Not a pcpu copy, whatever the original was. */
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}
2124
2125 /*
2126  *      Destination cache support functions
2127  */
2128
2129 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2130 {
2131         u32 rt_cookie = 0;
2132
2133         if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2134                 return false;
2135
2136         if (fib6_check_expired(f6i))
2137                 return false;
2138
2139         return true;
2140 }
2141
2142 static struct dst_entry *rt6_check(struct rt6_info *rt,
2143                                    struct fib6_info *from,
2144                                    u32 cookie)
2145 {
2146         u32 rt_cookie = 0;
2147
2148         if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2149             rt_cookie != cookie)
2150                 return NULL;
2151
2152         if (rt6_check_expired(rt))
2153                 return NULL;
2154
2155         return &rt->dst;
2156 }
2157
2158 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2159                                             struct fib6_info *from,
2160                                             u32 cookie)
2161 {
2162         if (!__rt6_check_expired(rt) &&
2163             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2164             fib6_check(from, cookie))
2165                 return &rt->dst;
2166         else
2167                 return NULL;
2168 }
2169
/* dst_ops->check hook: revalidate a cached IPv6 dst.
 * Returns the dst if still valid, NULL if the caller must relookup.
 */
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct dst_entry *dst_ret;
	struct fib6_info *from;
	struct rt6_info *rt;

	rt = container_of(dst, struct rt6_info, dst);

	rcu_read_lock();

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	from = rcu_dereference(rt->from);

	/* Per-cpu copies and uncached clones have their own validity
	 * rules; everything else (cached exceptions included) goes
	 * through the plain cookie/expiry check.
	 */
	if (from && (rt->rt6i_flags & RTF_PCPU ||
	    unlikely(!list_empty(&rt->rt6i_uncached))))
		dst_ret = rt6_dst_from_check(rt, from, cookie);
	else
		dst_ret = rt6_check(rt, from, cookie);

	rcu_read_unlock();

	return dst_ret;
}
2197
/* dst_ops->negative_advice hook: a socket reports its cached route looks
 * bad.  Expired cached exceptions are removed from the exception table;
 * non-cache dsts are simply released so the caller redoes the lookup.
 */
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			rcu_read_lock();
			if (rt6_check_expired(rt)) {
				rt6_remove_exception_rt(rt);
				/* Tell the caller to drop its reference. */
				dst = NULL;
			}
			rcu_read_unlock();
		} else {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}
2217
/* dst_ops->link_failure hook: the neighbour layer gave up on this dst.
 * Send an unreachable error back, then either drop the cached exception
 * or, for default routes, invalidate the FIB node's serial number so
 * cached lookups are redone.
 */
static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			/* Hold guards against a concurrent final release
			 * while we unlink the exception.
			 */
			if (dst_hold_safe(&rt->dst))
				rt6_remove_exception_rt(rt);
		} else {
			struct fib6_info *from;
			struct fib6_node *fn;

			rcu_read_lock();
			from = rcu_dereference(rt->from);
			if (from) {
				fn = rcu_dereference(from->fib6_node);
				/* sernum = -1 forces revalidation of any
				 * dst derived from this node.
				 */
				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
					fn->fn_sernum = -1;
			}
			rcu_read_unlock();
		}
	}
}
2244
/* Arm the expiry timer of cached route @rt0 to fire in @timeout jiffies.
 * If the route had no expiry of its own yet, first inherit the parent
 * FIB entry's expiry — presumably so dst_set_expires() keeps whichever
 * deadline is earlier (NOTE(review): confirm against dst_set_expires()).
 */
static void rt6_update_expires(struct rt6_info *rt0, int timeout)
{
	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
		struct fib6_info *from;

		rcu_read_lock();
		from = rcu_dereference(rt0->from);
		if (from)
			rt0->dst.expires = from->expires;
		rcu_read_unlock();
	}

	dst_set_expires(&rt0->dst, timeout);
	rt0->rt6i_flags |= RTF_EXPIRES;
}
2260
2261 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2262 {
2263         struct net *net = dev_net(rt->dst.dev);
2264
2265         dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2266         rt->rt6i_flags |= RTF_MODIFIED;
2267         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2268 }
2269
2270 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2271 {
2272         bool from_set;
2273
2274         rcu_read_lock();
2275         from_set = !!rcu_dereference(rt->from);
2276         rcu_read_unlock();
2277
2278         return !(rt->rt6i_flags & RTF_CACHE) &&
2279                 (rt->rt6i_flags & RTF_PCPU || from_set);
2280 }
2281
2282 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2283                                  const struct ipv6hdr *iph, u32 mtu)
2284 {
2285         const struct in6_addr *daddr, *saddr;
2286         struct rt6_info *rt6 = (struct rt6_info *)dst;
2287
2288         if (rt6->rt6i_flags & RTF_LOCAL)
2289                 return;
2290
2291         if (dst_metric_locked(dst, RTAX_MTU))
2292                 return;
2293
2294         if (iph) {
2295                 daddr = &iph->daddr;
2296                 saddr = &iph->saddr;
2297         } else if (sk) {
2298                 daddr = &sk->sk_v6_daddr;
2299                 saddr = &inet6_sk(sk)->saddr;
2300         } else {
2301                 daddr = NULL;
2302                 saddr = NULL;
2303         }
2304         dst_confirm_neigh(dst, daddr);
2305         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2306         if (mtu >= dst_mtu(dst))
2307                 return;
2308
2309         if (!rt6_cache_allowed_for_pmtu(rt6)) {
2310                 rt6_do_update_pmtu(rt6, mtu);
2311                 /* update rt6_ex->stamp for cache */
2312                 if (rt6->rt6i_flags & RTF_CACHE)
2313                         rt6_update_exception_stamp_rt(rt6);
2314         } else if (daddr) {
2315                 struct fib6_info *from;
2316                 struct rt6_info *nrt6;
2317
2318                 rcu_read_lock();
2319                 from = rcu_dereference(rt6->from);
2320                 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
2321                 if (nrt6) {
2322                         rt6_do_update_pmtu(nrt6, mtu);
2323                         if (rt6_insert_exception(nrt6, from))
2324                                 dst_release_immediate(&nrt6->dst);
2325                 }
2326                 rcu_read_unlock();
2327         }
2328 }
2329
2330 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2331                                struct sk_buff *skb, u32 mtu)
2332 {
2333         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2334 }
2335
2336 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2337                      int oif, u32 mark, kuid_t uid)
2338 {
2339         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2340         struct dst_entry *dst;
2341         struct flowi6 fl6;
2342
2343         memset(&fl6, 0, sizeof(fl6));
2344         fl6.flowi6_oif = oif;
2345         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2346         fl6.daddr = iph->daddr;
2347         fl6.saddr = iph->saddr;
2348         fl6.flowlabel = ip6_flowinfo(iph);
2349         fl6.flowi6_uid = uid;
2350
2351         dst = ip6_route_output(net, NULL, &fl6);
2352         if (!dst->error)
2353                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2354         dst_release(dst);
2355 }
2356 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2357
/* Socket-level PMTU update: apply the new MTU to the socket's flow, then
 * refresh the socket's cached dst if the update invalidated it.
 */
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	struct dst_entry *dst;

	ip6_update_pmtu(skb, sock_net(sk), mtu,
			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);

	/* Nothing more to do if the cached dst still validates. */
	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	/* Rebuild the cached route, but only for native-IPv6 destinations
	 * and only when the socket is not locked by user context.
	 */
	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2376
/* Cache @dst on @sk, recording the destination (and, with subtrees, the
 * source) address only when it matches the socket's own address — the
 * address arguments act as validity keys for the cached route.
 */
void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
			   const struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_SUBTREES
	struct ipv6_pinfo *np = inet6_sk(sk);
#endif

	ip6_dst_store(sk, dst,
		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
		      &sk->sk_v6_daddr : NULL,
#ifdef CONFIG_IPV6_SUBTREES
		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
		      &np->saddr :
#endif
		      NULL);
}
2393
/* Handle redirects: a flowi6 extended with the redirecting router's
 * address, so the rule-lookup callback can verify the redirect came from
 * the current next hop (RFC 4861).
 */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;	/* source of the redirect message */
};
2399
/* Resolve the route affected by a redirect message.  Only routes whose
 * next hop matches the redirecting router (directly, or via a cached
 * exception) are accepted, per RFC 4861.
 */
static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *ret = NULL, *rt_cache;
	struct fib6_info *rt;
	struct fib6_node *fn;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	rcu_read_lock();
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;
		if (fib6_check_expired(rt))
			continue;
		if (rt->fib6_flags & RTF_REJECT)
			break;
		if (!(rt->fib6_flags & RTF_GATEWAY))
			continue;
		if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
			continue;
		/* rt_cache's gateway might be different from its 'parent'
		 * in the case of an ip redirect.
		 * So we keep searching in the exception table if the gateway
		 * is different.
		 */
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
			rt_cache = rt6_find_cached_rt(rt,
						      &fl6->daddr,
						      &fl6->saddr);
			if (rt_cache &&
			    ipv6_addr_equal(&rdfl->gateway,
					    &rt_cache->rt6i_gateway)) {
				ret = rt_cache;
				break;
			}
			continue;
		}
		break;
	}

	if (!rt)
		rt = net->ipv6.fib6_null_entry;
	else if (rt->fib6_flags & RTF_REJECT) {
		ret = net->ipv6.ip6_null_entry;
		goto out;
	}

	/* Nothing matched at this node: back up the tree and retry. */
	if (rt == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

out:
	if (ret)
		dst_hold(&ret->dst);
	else
		ret = ip6_create_rt_rcu(rt);

	rcu_read_unlock();

	trace_fib6_table_lookup(net, ret, table, fl6);
	return ret;
};
2479
2480 static struct dst_entry *ip6_route_redirect(struct net *net,
2481                                             const struct flowi6 *fl6,
2482                                             const struct sk_buff *skb,
2483                                             const struct in6_addr *gateway)
2484 {
2485         int flags = RT6_LOOKUP_F_HAS_SADDR;
2486         struct ip6rd_flowi rdfl;
2487
2488         rdfl.fl6 = *fl6;
2489         rdfl.gateway = *gateway;
2490
2491         return fib6_rule_lookup(net, &rdfl.fl6, skb,
2492                                 flags, __ip6_route_redirect);
2493 }
2494
2495 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2496                   kuid_t uid)
2497 {
2498         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2499         struct dst_entry *dst;
2500         struct flowi6 fl6;
2501
2502         memset(&fl6, 0, sizeof(fl6));
2503         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2504         fl6.flowi6_oif = oif;
2505         fl6.flowi6_mark = mark;
2506         fl6.daddr = iph->daddr;
2507         fl6.saddr = iph->saddr;
2508         fl6.flowlabel = ip6_flowinfo(iph);
2509         fl6.flowi6_uid = uid;
2510
2511         dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2512         rt6_do_redirect(dst, NULL, skb);
2513         dst_release(dst);
2514 }
2515 EXPORT_SYMBOL_GPL(ip6_redirect);
2516
/* Variant of ip6_redirect() for the case where the ICMPv6 message does
 * not embed the offending packet's header: the flow is keyed on the
 * redirect message's destination (msg->dest) instead.
 *
 * NOTE(review): fl6.saddr is set from the *outer* header's daddr --
 * presumably our own address the redirect was sent to; confirm before
 * changing.
 */
void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
			    u32 mark)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
	struct dst_entry *dst;
	struct flowi6 fl6;

	/* Zero the whole key, including fields not set below. */
	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_iif = LOOPBACK_IFINDEX;
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark;
	fl6.daddr = msg->dest;
	fl6.saddr = iph->daddr;
	fl6.flowi6_uid = sock_net_uid(net, NULL);

	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
2537
2538 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2539 {
2540         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2541                      sk->sk_uid);
2542 }
2543 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2544
2545 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2546 {
2547         struct net_device *dev = dst->dev;
2548         unsigned int mtu = dst_mtu(dst);
2549         struct net *net = dev_net(dev);
2550
2551         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2552
2553         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2554                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2555
2556         /*
2557          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2558          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2559          * IPV6_MAXPLEN is also valid and means: "any MSS,
2560          * rely only on pmtu discovery"
2561          */
2562         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2563                 mtu = IPV6_MAXPLEN;
2564         return mtu;
2565 }
2566
2567 static unsigned int ip6_mtu(const struct dst_entry *dst)
2568 {
2569         struct inet6_dev *idev;
2570         unsigned int mtu;
2571
2572         mtu = dst_metric_raw(dst, RTAX_MTU);
2573         if (mtu)
2574                 goto out;
2575
2576         mtu = IPV6_MIN_MTU;
2577
2578         rcu_read_lock();
2579         idev = __in6_dev_get(dst->dev);
2580         if (idev)
2581                 mtu = idev->cnf.mtu6;
2582         rcu_read_unlock();
2583
2584 out:
2585         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2586
2587         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2588 }
2589
/* Allocate a dst for an outgoing ICMPv6 packet towards fl6->daddr on
 * @dev.  The route is a /128 host route that is not inserted into the
 * FIB here; it is linked on the uncached list so rt6_disable_ip() can
 * do a proper release of the net_device on unregister.  Returns the
 * (possibly xfrm-wrapped) dst, or an ERR_PTR on failure.
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		in6_dev_put(idev);
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output  = ip6_output;
	rt->rt6i_gateway  = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev     = idev;	/* the idev reference is transferred here */
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Add this dst into uncached_list so that rt6_disable_ip() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);
	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}
2628
/* dst_ops garbage-collection callback for IPv6 routes.
 *
 * The expensive fib6 walk is skipped when the previous GC ran less
 * than ip6_rt_gc_min_interval ago and the entry count is still within
 * ip6_rt_max_size.  ip6_rt_gc_expire is incremented on every run
 * (making expiry progressively more aggressive), reset to half the
 * gc timeout once the count drops under gc_thresh, and decayed by
 * 1/2^ip6_rt_gc_elasticity on every call.  Returns non-zero while the
 * entry count still exceeds rt_max_size, signalling pressure to the
 * dst layer.
 */
static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
	return entries > rt_max_size;
}
2653
2654 static int ip6_convert_metrics(struct net *net, struct fib6_info *rt,
2655                                struct fib6_config *cfg)
2656 {
2657         struct dst_metrics *p;
2658
2659         if (!cfg->fc_mx)
2660                 return 0;
2661
2662         p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL);
2663         if (unlikely(!p))
2664                 return -ENOMEM;
2665
2666         refcount_set(&p->refcnt, 1);
2667         rt->fib6_metrics = p;
2668
2669         return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics);
2670 }
2671
2672 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2673                                             struct fib6_config *cfg,
2674                                             const struct in6_addr *gw_addr,
2675                                             u32 tbid, int flags)
2676 {
2677         struct flowi6 fl6 = {
2678                 .flowi6_oif = cfg->fc_ifindex,
2679                 .daddr = *gw_addr,
2680                 .saddr = cfg->fc_prefsrc,
2681         };
2682         struct fib6_table *table;
2683         struct rt6_info *rt;
2684
2685         table = fib6_get_table(net, tbid);
2686         if (!table)
2687                 return NULL;
2688
2689         if (!ipv6_addr_any(&cfg->fc_prefsrc))
2690                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2691
2692         flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2693         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2694
2695         /* if table lookup failed, fall back to full lookup */
2696         if (rt == net->ipv6.ip6_null_entry) {
2697                 ip6_rt_put(rt);
2698                 rt = NULL;
2699         }
2700
2701         return rt;
2702 }
2703
2704 static int ip6_route_check_nh_onlink(struct net *net,
2705                                      struct fib6_config *cfg,
2706                                      const struct net_device *dev,
2707                                      struct netlink_ext_ack *extack)
2708 {
2709         u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2710         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2711         u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2712         struct rt6_info *grt;
2713         int err;
2714
2715         err = 0;
2716         grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2717         if (grt) {
2718                 if (!grt->dst.error &&
2719                     (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2720                         NL_SET_ERR_MSG(extack,
2721                                        "Nexthop has invalid gateway or device mismatch");
2722                         err = -EINVAL;
2723                 }
2724
2725                 ip6_rt_put(grt);
2726         }
2727
2728         return err;
2729 }
2730
/* Resolve and validate a (non-onlink) gateway: look up the route to
 * the gateway address and, when the caller supplied no device, fill in
 * *_dev/*idev from the lookup result (taking references for the
 * caller).  Returns 0 when the gateway is directly reachable, i.e. the
 * resolved route is not itself via a gateway; -EHOSTUNREACH otherwise.
 */
static int ip6_route_check_nh(struct net *net,
			      struct fib6_config *cfg,
			      struct net_device **_dev,
			      struct inet6_dev **idev)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	struct net_device *dev = _dev ? *_dev : NULL;
	struct rt6_info *grt = NULL;
	int err = -EHOSTUNREACH;

	if (cfg->fc_table) {
		int flags = RT6_LOOKUP_F_IFACE;

		/* Try the table the route is being added to first. */
		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
					  cfg->fc_table, flags);
		if (grt) {
			/* Discard a result that itself goes via a gateway
			 * or uses a different device than requested.
			 */
			if (grt->rt6i_flags & RTF_GATEWAY ||
			    (dev && dev != grt->dst.dev)) {
				ip6_rt_put(grt);
				grt = NULL;
			}
		}
	}

	/* Fall back to a full lookup across tables. */
	if (!grt)
		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);

	if (!grt)
		goto out;

	if (dev) {
		if (dev != grt->dst.dev) {
			ip6_rt_put(grt);
			goto out;
		}
	} else {
		/* Adopt device and idev from the lookup result and take
		 * references on behalf of the caller.
		 */
		*_dev = dev = grt->dst.dev;
		*idev = grt->rt6i_idev;
		dev_hold(dev);
		in6_dev_hold(grt->rt6i_idev);
	}

	if (!(grt->rt6i_flags & RTF_GATEWAY))
		err = 0;

	ip6_rt_put(grt);

out:
	return err;
}
2781
/* Validate the gateway of a route being added and resolve the egress
 * device.  May update *_dev/*idev (via ip6_route_check_nh()) when the
 * caller did not specify a device.  Returns 0 when the gateway is
 * usable, or a negative errno with extack set.
 */
static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
			   struct net_device **_dev, struct inet6_dev **idev,
			   struct netlink_ext_ack *extack)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	int gwa_type = ipv6_addr_type(gw_addr);
	/* for link-local gateways the address may legitimately exist
	 * locally on another interface, so only skip the egress device
	 * in the check for non-link-local addresses
	 */
	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
	const struct net_device *dev = *_dev;
	bool need_addr_check = !dev;
	int err = -EINVAL;

	/* if gw_addr is local we will fail to detect this in case
	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
	 * will return already-added prefix route via interface that
	 * prefix route was assigned to, which might be non-loopback.
	 */
	if (dev &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
		/* IPv6 strictly inhibits using not link-local
		 * addresses as nexthop address.
		 * Otherwise, router will not able to send redirects.
		 * It is very good, but in some (rare!) circumstances
		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
		 * some exceptions. --ANK
		 * We allow IPv4-mapped nexthops to support RFC4798-type
		 * addressing
		 */
		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
			NL_SET_ERR_MSG(extack, "Invalid gateway address");
			goto out;
		}

		if (cfg->fc_flags & RTNH_F_ONLINK)
			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
		else
			err = ip6_route_check_nh(net, cfg, _dev, idev);

		if (err)
			goto out;
	}

	/* reload in case device was changed */
	dev = *_dev;

	err = -EINVAL;
	if (!dev) {
		NL_SET_ERR_MSG(extack, "Egress device not specified");
		goto out;
	} else if (dev->flags & IFF_LOOPBACK) {
		NL_SET_ERR_MSG(extack,
			       "Egress device can not be loopback device for this route");
		goto out;
	}

	/* if we did not check gw_addr above, do so now that the
	 * egress device has been resolved.
	 */
	if (need_addr_check &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	err = 0;
out:
	return err;
}
2854
/* Build -- but do not insert -- a fib6_info from a route configuration.
 * Validates @cfg, resolves the egress device/idev and gateway, and
 * returns the new entry with fib6_table already set, or an ERR_PTR.
 * On success the device reference taken here is transferred to
 * rt->fib6_nh.nh_dev; the idev reference is dropped.  The caller owns
 * the returned fib6_info reference.
 */
static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
					      gfp_t gfp_flags,
					      struct netlink_ext_ack *extack)
{
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct fib6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	struct fib6_table *table;
	int addr_type;
	int err = -EINVAL;

	/* RTF_PCPU is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_PCPU) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
		goto out;
	}

	/* RTF_CACHE is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_CACHE) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
		goto out;
	}

	if (cfg->fc_type > RTN_MAX) {
		NL_SET_ERR_MSG(extack, "Invalid route type");
		goto out;
	}

	if (cfg->fc_dst_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		goto out;
	}
	if (cfg->fc_src_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid source address length");
		goto out;
	}
#ifndef CONFIG_IPV6_SUBTREES
	if (cfg->fc_src_len) {
		NL_SET_ERR_MSG(extack,
			       "Specifying source address requires IPV6_SUBTREES to be enabled");
		goto out;
	}
#endif
	if (cfg->fc_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_metric == 0)
		cfg->fc_metric = IP6_RT_PRIO_USER;

	/* onlink nexthops must name an up device explicitly */
	if (cfg->fc_flags & RTNH_F_ONLINK) {
		if (!dev) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop device required for onlink");
			err = -ENODEV;
			goto out;
		}

		if (!(dev->flags & IFF_UP)) {
			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
			err = -ENETDOWN;
			goto out;
		}
	}

	/* without NLM_F_CREATE, only warn when the table is missing;
	 * it is still created below
	 */
	err = -ENOBUFS;
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
		if (!table) {
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}

	if (!table)
		goto out;

	err = -ENOMEM;
	rt = fib6_info_alloc(gfp_flags);
	if (!rt)
		goto out;

	/* NOTE(review): presumably exempts addrconf routes from the
	 * dst entry accounting -- confirm against fib6_info_alloc users.
	 */
	if (cfg->fc_flags & RTF_ADDRCONF)
		rt->dst_nocount = true;

	err = ip6_convert_metrics(net, rt, cfg);
	if (err < 0)
		goto out;

	if (cfg->fc_flags & RTF_EXPIRES)
		fib6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
	else
		fib6_clean_expires(rt);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->fib6_protocol = cfg->fc_protocol;

	addr_type = ipv6_addr_type(&cfg->fc_dst);

	if (cfg->fc_encap) {
		struct lwtunnel_state *lwtstate;

		err = lwtunnel_build_state(cfg->fc_encap_type,
					   cfg->fc_encap, AF_INET6, cfg,
					   &lwtstate, extack);
		if (err)
			goto out;
		rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
	}

	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->fib6_dst.plen = cfg->fc_dst_len;
	if (rt->fib6_dst.plen == 128)
		rt->dst_host = true;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->fib6_src.plen = cfg->fc_src_len;
#endif

	rt->fib6_metric = cfg->fc_metric;
	rt->fib6_nh.nh_weight = 1;

	rt->fib6_type = cfg->fc_type;

	/* We cannot add true routes via loopback here,
	   they would result in kernel looping; promote them to reject routes
	 */
	if ((cfg->fc_flags & RTF_REJECT) ||
	    (dev && (dev->flags & IFF_LOOPBACK) &&
	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
	     !(cfg->fc_flags & RTF_LOCAL))) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
		goto install_route;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		/* may change dev/idev when no device was specified */
		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
		if (err)
			goto out;

		rt->fib6_nh.nh_gw = cfg->fc_gateway;
	}

	err = -ENODEV;
	if (!dev)
		goto out;

	if (idev->cnf.disable_ipv6) {
		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
		err = -EACCES;
		goto out;
	}

	if (!(dev->flags & IFF_UP)) {
		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
		err = -ENETDOWN;
		goto out;
	}

	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid source address");
			err = -EINVAL;
			goto out;
		}
		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
		rt->fib6_prefsrc.plen = 128;
	} else
		rt->fib6_prefsrc.plen = 0;

	rt->fib6_flags = cfg->fc_flags;

install_route:
	if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
	    !netif_carrier_ok(dev))
		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
	rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
	rt->fib6_nh.nh_dev = dev;	/* dev reference moves to the nexthop */
	rt->fib6_table = table;

	cfg->fc_nlinfo.nl_net = dev_net(dev);

	if (idev)
		in6_dev_put(idev);

	return rt;
out:
	/* error path: drop any device/idev refs and the fib6_info
	 * (fib6_info_release() tolerates rt == NULL)
	 */
	if (dev)
		dev_put(dev);
	if (idev)
		in6_dev_put(idev);

	fib6_info_release(rt);
	return ERR_PTR(err);
}
3077
3078 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3079                   struct netlink_ext_ack *extack)
3080 {
3081         struct fib6_info *rt;
3082         int err;
3083
3084         rt = ip6_route_info_create(cfg, gfp_flags, extack);
3085         if (IS_ERR(rt))
3086                 return PTR_ERR(rt);
3087
3088         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3089         fib6_info_release(rt);
3090
3091         return err;
3092 }
3093
3094 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3095 {
3096         struct net *net = info->nl_net;
3097         struct fib6_table *table;
3098         int err;
3099
3100         if (rt == net->ipv6.fib6_null_entry) {
3101                 err = -ENOENT;
3102                 goto out;
3103         }
3104
3105         table = rt->fib6_table;
3106         spin_lock_bh(&table->tb6_lock);
3107         err = fib6_del(rt, info);
3108         spin_unlock_bh(&table->tb6_lock);
3109
3110 out:
3111         fib6_info_release(rt);
3112         return err;
3113 }
3114
/* Delete @rt from its table.  Consumes the caller's reference on @rt
 * (__ip6_del_rt() releases it on every path, including errors).
 */
int ip6_del_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net };

	return __ip6_del_rt(rt, &info);
}
3121
/* Delete @rt and, when cfg->fc_delete_all_nh is set, all of its
 * multipath siblings under one table lock.  A single RTM_DELROUTE
 * notification covering every hop is preferred; if the skb cannot be
 * built or filled, skip_notify stays 0 and the per-delete
 * notifications are used instead.  Consumes the caller's reference
 * on @rt.
 */
static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	int err = -ENOENT;

	if (rt == net->ipv6.fib6_null_entry)
		goto out_put;
	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);

	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
		struct fib6_info *sibling, *next_sibling;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			if (rt6_fill_node(net, skb, rt, NULL,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				kfree_skb(skb);
				skb = NULL;
			} else
				info->skip_notify = 1;
		}

		/* delete every sibling first; @rt itself goes last */
		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings,
					 fib6_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
		}
	}

	err = fib6_del(rt, info);
out_unlock:
	spin_unlock_bh(&table->tb6_lock);
out_put:
	fib6_info_release(rt);

	/* the combined notification is sent outside the table lock */
	if (skb) {
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
	return err;
}
3173
3174 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3175 {
3176         int rc = -ESRCH;
3177
3178         if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3179                 goto out;
3180
3181         if (cfg->fc_flags & RTF_GATEWAY &&
3182             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3183                 goto out;
3184         if (dst_hold_safe(&rt->dst))
3185                 rc = rt6_remove_exception_rt(rt);
3186 out:
3187         return rc;
3188 }
3189
/* Delete route(s) described by @cfg.  With RTF_CACHE only the matching
 * cached exception entry is removed; otherwise the first FIB entry
 * passing the device/gateway/metric/protocol filters is deleted --
 * together with its multipath siblings unless a gateway was specified,
 * in which case only that one hop goes.  Returns 0 or a negative errno
 * (-ESRCH when nothing matched).
 */
static int ip6_route_del(struct fib6_config *cfg,
			 struct netlink_ext_ack *extack)
{
	struct rt6_info *rt_cache;
	struct fib6_table *table;
	struct fib6_info *rt;
	struct fib6_node *fn;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table) {
		NL_SET_ERR_MSG(extack, "FIB table does not exist");
		return err;
	}

	rcu_read_lock();

	/* for RTF_CACHE the covering node (not an exact match) is
	 * located, since exceptions hang off the parent route
	 */
	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len,
			 !(cfg->fc_flags & RTF_CACHE));

	if (fn) {
		for_each_fib6_node_rt_rcu(fn) {
			if (cfg->fc_flags & RTF_CACHE) {
				int rc;

				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
							      &cfg->fc_src);
				if (rt_cache) {
					rc = ip6_del_cached_rt(rt_cache, cfg);
					if (rc != -ESRCH)
						return rc;
				}
				continue;
			}
			if (cfg->fc_ifindex &&
			    (!rt->fib6_nh.nh_dev ||
			     rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
				continue;
			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
				continue;
			if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
				continue;
			/* take a reference before leaving the RCU section;
			 * the __ip6_del_rt* helpers consume it
			 */
			fib6_info_hold(rt);
			rcu_read_unlock();

			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				return __ip6_del_rt(rt, &cfg->fc_nlinfo);

			return __ip6_del_rt_siblings(rt, cfg);
		}
	}
	rcu_read_unlock();

	return err;
}
3251
3252 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3253 {
3254         struct netevent_redirect netevent;
3255         struct rt6_info *rt, *nrt = NULL;
3256         struct ndisc_options ndopts;
3257         struct inet6_dev *in6_dev;
3258         struct neighbour *neigh;
3259         struct fib6_info *from;
3260         struct rd_msg *msg;
3261         int optlen, on_link;
3262         u8 *lladdr;
3263
3264         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3265         optlen -= sizeof(*msg);
3266
3267         if (optlen < 0) {
3268                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3269                 return;
3270         }
3271
3272         msg = (struct rd_msg *)icmp6_hdr(skb);
3273
3274         if (ipv6_addr_is_multicast(&msg->dest)) {
3275                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3276                 return;
3277         }
3278
3279         on_link = 0;
3280         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3281                 on_link = 1;
3282         } else if (ipv6_addr_type(&msg->target) !=
3283                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3284                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3285                 return;
3286         }
3287
3288         in6_dev = __in6_dev_get(skb->dev);
3289         if (!in6_dev)
3290                 return;
3291         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3292                 return;
3293
3294         /* RFC2461 8.1:
3295          *      The IP source address of the Redirect MUST be the same as the current
3296          *      first-hop router for the specified ICMP Destination Address.
3297          */
3298
3299         if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3300                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3301                 return;
3302         }
3303
3304         lladdr = NULL;
3305         if (ndopts.nd_opts_tgt_lladdr) {
3306                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3307                                              skb->dev);
3308                 if (!lladdr) {
3309                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3310                         return;
3311                 }
3312         }
3313
3314         rt = (struct rt6_info *) dst;
3315         if (rt->rt6i_flags & RTF_REJECT) {
3316                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3317                 return;
3318         }
3319
3320         /* Redirect received -> path was valid.
3321          * Look, redirects are sent only in response to data packets,
3322          * so that this nexthop apparently is reachable. --ANK
3323          */
3324         dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3325
3326         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3327         if (!neigh)
3328                 return;
3329
3330         /*
3331          *      We have finally decided to accept it.
3332          */
3333
3334         ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3335                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
3336                      NEIGH_UPDATE_F_OVERRIDE|
3337                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3338                                      NEIGH_UPDATE_F_ISROUTER)),
3339                      NDISC_REDIRECT, &ndopts);
3340
3341         rcu_read_lock();
3342         from = rcu_dereference(rt->from);
3343         nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3344         rcu_read_unlock();
3345         if (!nrt)
3346                 goto out;
3347
3348         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3349         if (on_link)
3350                 nrt->rt6i_flags &= ~RTF_GATEWAY;
3351
3352         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3353
3354         /* No need to remove rt from the exception table if rt is
3355          * a cached route because rt6_insert_exception() will
3356          * takes care of it
3357          */
3358         if (rt6_insert_exception(nrt, rt->from)) {
3359                 dst_release_immediate(&nrt->dst);
3360                 goto out;
3361         }
3362
3363         netevent.old = &rt->dst;
3364         netevent.new = &nrt->dst;
3365         netevent.daddr = &msg->dest;
3366         netevent.neigh = neigh;
3367         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3368
3369 out:
3370         neigh_release(neigh);
3371 }
3372
3373 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Find the existing route-information route (RTF_ROUTEINFO|RTF_GATEWAY)
 * for @prefix/@prefixlen learned from gateway @gwaddr on @dev.
 * Returns the fib6_info with a reference held, or NULL when no such
 * route (or table) exists.
 */
static struct fib6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	int ifindex = dev->ifindex;
	struct fib6_node *fn;
	struct fib6_info *rt = NULL;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
	if (!fn)
		goto out;

	/* match on device, both flags, and gateway address */
	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.nh_dev->ifindex != ifindex)
			continue;
		if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
			continue;
		if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
			continue;
		/* take the reference before leaving the RCU section */
		fib6_info_hold(rt);
		break;
	}
out:
	rcu_read_unlock();
	return rt;
}
3408
3409 static struct fib6_info *rt6_add_route_info(struct net *net,
3410                                            const struct in6_addr *prefix, int prefixlen,
3411                                            const struct in6_addr *gwaddr,
3412                                            struct net_device *dev,
3413                                            unsigned int pref)
3414 {
3415         struct fib6_config cfg = {
3416                 .fc_metric      = IP6_RT_PRIO_USER,
3417                 .fc_ifindex     = dev->ifindex,
3418                 .fc_dst_len     = prefixlen,
3419                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3420                                   RTF_UP | RTF_PREF(pref),
3421                 .fc_protocol = RTPROT_RA,
3422                 .fc_type = RTN_UNICAST,
3423                 .fc_nlinfo.portid = 0,
3424                 .fc_nlinfo.nlh = NULL,
3425                 .fc_nlinfo.nl_net = net,
3426         };
3427
3428         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3429         cfg.fc_dst = *prefix;
3430         cfg.fc_gateway = *gwaddr;
3431
3432         /* We should treat it as a default route if prefix length is 0. */
3433         if (!prefixlen)
3434                 cfg.fc_flags |= RTF_DEFAULT;
3435
3436         ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3437
3438         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3439 }
3440 #endif
3441
/* Find the RA-learned default-router entry (RTF_ADDRCONF|RTF_DEFAULT)
 * whose gateway is @addr and whose nexthop device is @dev.
 * Returns the entry with a reference held, or NULL.
 */
struct fib6_info *rt6_get_dflt_router(struct net *net,
				     const struct in6_addr *addr,
				     struct net_device *dev)
{
	/* l3mdev (VRF) slaves use the master's table, else RT6_TABLE_DFLT */
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
	struct fib6_info *rt;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	/* 'rt' is the implicit iteration cursor; it is left NULL when
	 * the walk completes without a break
	 */
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		if (dev == rt->fib6_nh.nh_dev &&
		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
			break;
	}
	if (rt)
		fib6_info_hold(rt);
	rcu_read_unlock();
	return rt;
}
3466
/* Install an RA-learned default route via @gwaddr on @dev and flag the
 * owning table so rt6_purge_dflt_routers() will scan it later.
 * Returns the inserted entry via rt6_get_dflt_router() (reference
 * held), or NULL if the add failed.
 */
struct fib6_info *rt6_add_dflt_router(struct net *net,
				     const struct in6_addr *gwaddr,
				     struct net_device *dev,
				     unsigned int pref)
{
	struct fib6_config cfg = {
		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
		.fc_metric	= IP6_RT_PRIO_USER,
		.fc_ifindex	= dev->ifindex,
		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
		.fc_protocol = RTPROT_RA,
		.fc_type = RTN_UNICAST,
		.fc_nlinfo.portid = 0,
		.fc_nlinfo.nlh = NULL,
		.fc_nlinfo.nl_net = net,
	};

	cfg.fc_gateway = *gwaddr;

	if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
		struct fib6_table *table;

		/* mark the table as holding at least one default router */
		table = fib6_get_table(dev_net(dev), cfg.fc_table);
		if (table)
			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
	}

	return rt6_get_dflt_router(net, gwaddr, dev);
}
3497
/* Delete every RA-learned (RTF_DEFAULT | RTF_ADDRCONF) route in @table,
 * except routes whose device still fully accepts RAs (accept_ra == 2).
 * Deleting cannot be done under rcu_read_lock(), so each hit drops the
 * lock and restarts the walk from the top.
 */
static void __rt6_purge_dflt_routers(struct net *net,
				     struct fib6_table *table)
{
	struct fib6_info *rt;

restart:
	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;

		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!idev || idev->cnf.accept_ra != 2)) {
			/* hold the entry across the unlock so it stays
			 * valid; ip6_del_rt() is expected to release
			 * this reference — confirm in its definition
			 */
			fib6_info_hold(rt);
			rcu_read_unlock();
			ip6_del_rt(net, rt);
			goto restart;
		}
	}
	rcu_read_unlock();

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}
3521
3522 void rt6_purge_dflt_routers(struct net *net)
3523 {
3524         struct fib6_table *table;
3525         struct hlist_head *head;
3526         unsigned int h;
3527
3528         rcu_read_lock();
3529
3530         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3531                 head = &net->ipv6.fib_table_hash[h];
3532                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3533                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3534                                 __rt6_purge_dflt_routers(net, table);
3535                 }
3536         }
3537
3538         rcu_read_unlock();
3539 }
3540
3541 static void rtmsg_to_fib6_config(struct net *net,
3542                                  struct in6_rtmsg *rtmsg,
3543                                  struct fib6_config *cfg)
3544 {
3545         memset(cfg, 0, sizeof(*cfg));
3546
3547         cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3548                          : RT6_TABLE_MAIN;
3549         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3550         cfg->fc_metric = rtmsg->rtmsg_metric;
3551         cfg->fc_expires = rtmsg->rtmsg_info;
3552         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3553         cfg->fc_src_len = rtmsg->rtmsg_src_len;
3554         cfg->fc_flags = rtmsg->rtmsg_flags;
3555         cfg->fc_type = rtmsg->rtmsg_type;
3556
3557         cfg->fc_nlinfo.nl_net = net;
3558
3559         cfg->fc_dst = rtmsg->rtmsg_dst;
3560         cfg->fc_src = rtmsg->rtmsg_src;
3561         cfg->fc_gateway = rtmsg->rtmsg_gateway;
3562 }
3563
3564 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3565 {
3566         struct fib6_config cfg;
3567         struct in6_rtmsg rtmsg;
3568         int err;
3569
3570         switch (cmd) {
3571         case SIOCADDRT:         /* Add a route */
3572         case SIOCDELRT:         /* Delete a route */
3573                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3574                         return -EPERM;
3575                 err = copy_from_user(&rtmsg, arg,
3576                                      sizeof(struct in6_rtmsg));
3577                 if (err)
3578                         return -EFAULT;
3579
3580                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3581
3582                 rtnl_lock();
3583                 switch (cmd) {
3584                 case SIOCADDRT:
3585                         err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3586                         break;
3587                 case SIOCDELRT:
3588                         err = ip6_route_del(&cfg, NULL);
3589                         break;
3590                 default:
3591                         err = -EINVAL;
3592                 }
3593                 rtnl_unlock();
3594
3595                 return err;
3596         }
3597
3598         return -EINVAL;
3599 }
3600
/*
 *	Drop the packet on the floor
 */

/* Common drop path for unroutable packets: bump the appropriate SNMP
 * counter (in/out no-route, or in-addr-errors for an unspecified
 * destination), send ICMPv6 destination-unreachable with @code, and
 * free the skb.  Always returns 0.
 */
static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
	int type;
	struct dst_entry *dst = skb_dst(skb);
	switch (ipstats_mib_noroutes) {
	case IPSTATS_MIB_INNOROUTES:
		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
		if (type == IPV6_ADDR_ANY) {
			/* unspecified destination: an address error,
			 * not a routing failure
			 */
			IP6_INC_STATS(dev_net(dst->dev),
				      __in6_dev_get_safely(skb->dev),
				      IPSTATS_MIB_INADDRERRORS);
			break;
		}
		/* FALLTHROUGH */
	case IPSTATS_MIB_OUTNOROUTES:
		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
			      ipstats_mib_noroutes);
		break;
	}
	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
	kfree_skb(skb);
	return 0;
}
3628
/* dst input handler for blackhole/unreachable routes (input path) */
static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
3633
/* dst output handler for blackhole/unreachable routes (output path) */
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}
3639
/* dst input handler for RTN_PROHIBIT routes (input path) */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}
3644
/* dst output handler for RTN_PROHIBIT routes (output path) */
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}
3650
/*
 *	Allocate a fib6_info for a local (unicast / anycast) address.
 *	The entry is a /128 host route, RTF_UP | RTF_NONEXTHOP, exempt
 *	from dst accounting (dst_nocount), and placed in the device's
 *	l3mdev table or RT6_TABLE_LOCAL.
 *	Returns ERR_PTR(-ENOMEM) on allocation failure.
 */

struct fib6_info *addrconf_f6i_alloc(struct net *net,
				     struct inet6_dev *idev,
				     const struct in6_addr *addr,
				     bool anycast, gfp_t gfp_flags)
{
	u32 tb_id;
	struct net_device *dev = idev->dev;
	struct fib6_info *f6i;

	f6i = fib6_info_alloc(gfp_flags);
	if (!f6i)
		return ERR_PTR(-ENOMEM);

	f6i->dst_nocount = true;
	f6i->dst_host = true;
	f6i->fib6_protocol = RTPROT_KERNEL;
	f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
	if (anycast) {
		f6i->fib6_type = RTN_ANYCAST;
		f6i->fib6_flags |= RTF_ANYCAST;
	} else {
		f6i->fib6_type = RTN_LOCAL;
		f6i->fib6_flags |= RTF_LOCAL;
	}

	/* NOTE(review): nh_gw is set to the local address itself here,
	 * not a real gateway — confirm consumers expect this
	 */
	f6i->fib6_nh.nh_gw = *addr;
	/* reference dropped when the route's nexthop is torn down */
	dev_hold(dev);
	f6i->fib6_nh.nh_dev = dev;
	f6i->fib6_dst.addr = *addr;
	f6i->fib6_dst.plen = 128;
	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
	/* NOTE(review): fib6_get_table() can return NULL; presumably the
	 * local table always exists by this point — verify
	 */
	f6i->fib6_table = fib6_get_table(net, tb_id);

	return f6i;
}
3690
/* Argument bundle for fib6_remove_prefsrc(): strip the deleted address
 * @addr from prefsrc entries of routes on @dev (all devices if NULL).
 */
struct arg_dev_net_ip {
	struct net_device *dev;	/* device the address lived on, or NULL */
	struct net *net;	/* owning network namespace */
	struct in6_addr *addr;	/* the address being removed */
};
3697
3698 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3699 {
3700         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3701         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3702         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3703
3704         if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3705             rt != net->ipv6.fib6_null_entry &&
3706             ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3707                 spin_lock_bh(&rt6_exception_lock);
3708                 /* remove prefsrc entry */
3709                 rt->fib6_prefsrc.plen = 0;
3710                 /* need to update cache as well */
3711                 rt6_exceptions_remove_prefsrc(rt);
3712                 spin_unlock_bh(&rt6_exception_lock);
3713         }
3714         return 0;
3715 }
3716
3717 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3718 {
3719         struct net *net = dev_net(ifp->idev->dev);
3720         struct arg_dev_net_ip adni = {
3721                 .dev = ifp->idev->dev,
3722                 .net = net,
3723                 .addr = &ifp->addr,
3724         };
3725         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3726 }
3727
3728 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3729
3730 /* Remove routers and update dst entries when gateway turn into host. */
3731 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3732 {
3733         struct in6_addr *gateway = (struct in6_addr *)arg;
3734
3735         if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3736             ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
3737                 return -1;
3738         }
3739
3740         /* Further clean up cached routes in exception table.
3741          * This is needed because cached route may have a different
3742          * gateway than its 'parent' in the case of an ip redirect.
3743          */
3744         rt6_exceptions_clean_tohost(rt, gateway);
3745
3746         return 0;
3747 }
3748
/* @gateway is no longer a router: purge RA router routes via it and
 * clean redirect exceptions, across all tables in @net.
 */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}
3753
/* Argument for the device-event walkers below: the device, plus either
 * nexthop flags to clear (rt6_sync_up) or the netdev event code
 * (rt6_sync_down_dev) — the anonymous union mirrors the two callers.
 */
struct arg_netdev_event {
	const struct net_device *dev;
	union {
		unsigned int nh_flags;
		unsigned long event;
	};
};
3761
/* Return the first route of @rt's multipath group: scan the leaf list
 * of rt's fib6_node for the first entry with the same metric that
 * qualifies for ECMP.  Caller must hold the table write lock — the
 * rcu_dereference_protected() calls assert tb6_lock is held.
 * Returns NULL if no qualifying sibling is found.
 */
static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
{
	struct fib6_info *iter;
	struct fib6_node *fn;

	fn = rcu_dereference_protected(rt->fib6_node,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
	iter = rcu_dereference_protected(fn->leaf,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
	while (iter) {
		if (iter->fib6_metric == rt->fib6_metric &&
		    rt6_qualify_for_ecmp(iter))
			return iter;
		iter = rcu_dereference_protected(iter->rt6_next,
				lockdep_is_held(&rt->fib6_table->tb6_lock));
	}

	return NULL;
}
3781
3782 static bool rt6_is_dead(const struct fib6_info *rt)
3783 {
3784         if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3785             (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3786              fib6_ignore_linkdown(rt)))
3787                 return true;
3788
3789         return false;
3790 }
3791
3792 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3793 {
3794         struct fib6_info *iter;
3795         int total = 0;
3796
3797         if (!rt6_is_dead(rt))
3798                 total += rt->fib6_nh.nh_weight;
3799
3800         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3801                 if (!rt6_is_dead(iter))
3802                         total += iter->fib6_nh.nh_weight;
3803         }
3804
3805         return total;
3806 }
3807
/* Assign the hash upper bound for one nexthop.  @*weight accumulates
 * the weight seen so far across the group; the bound is the cumulative
 * fraction of @total scaled to 31 bits, minus one.  A dead nexthop gets
 * -1 so hash selection never picks it.
 */
static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
{
	int upper_bound = -1;

	if (!rt6_is_dead(rt)) {
		*weight += rt->fib6_nh.nh_weight;
		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
						    total) - 1;
	}
	atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
}
3819
3820 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3821 {
3822         struct fib6_info *iter;
3823         int weight = 0;
3824
3825         rt6_upper_bound_set(rt, &weight, total);
3826
3827         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3828                 rt6_upper_bound_set(iter, &weight, total);
3829 }
3830
/* Recompute the weighted hash upper bounds of @rt's multipath group
 * after a nexthop changed state.  No-op for non-multipath routes and
 * for groups already marked for flushing.
 */
void rt6_multipath_rebalance(struct fib6_info *rt)
{
	struct fib6_info *first;
	int total;

	/* In case the entire multipath route was marked for flushing,
	 * then there is no need to rebalance upon the removal of every
	 * sibling route.
	 */
	if (!rt->fib6_nsiblings || rt->should_flush)
		return;

	/* During lookup routes are evaluated in order, so we need to
	 * make sure upper bounds are assigned from the first sibling
	 * onwards.
	 */
	first = rt6_multipath_first_sibling(rt);
	if (WARN_ON_ONCE(!first))
		return;

	total = rt6_multipath_total_weight(first);
	rt6_multipath_upper_bound_set(first, total);
}
3854
3855 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3856 {
3857         const struct arg_netdev_event *arg = p_arg;
3858         struct net *net = dev_net(arg->dev);
3859
3860         if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
3861                 rt->fib6_nh.nh_flags &= ~arg->nh_flags;
3862                 fib6_update_sernum_upto_root(net, rt);
3863                 rt6_multipath_rebalance(rt);
3864         }
3865
3866         return 0;
3867 }
3868
/* Device came (partially) back up: clear @nh_flags on every nexthop
 * using @dev.  When clearing RTNH_F_DEAD and the carrier is up, clear
 * RTNH_F_LINKDOWN in the same pass.
 */
void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.nh_flags = nh_flags,
		},
	};

	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
		arg.nh_flags |= RTNH_F_LINKDOWN;

	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
}
3883
3884 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3885                                    const struct net_device *dev)
3886 {
3887         struct fib6_info *iter;
3888
3889         if (rt->fib6_nh.nh_dev == dev)
3890                 return true;
3891         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3892                 if (iter->fib6_nh.nh_dev == dev)
3893                         return true;
3894
3895         return false;
3896 }
3897
3898 static void rt6_multipath_flush(struct fib6_info *rt)
3899 {
3900         struct fib6_info *iter;
3901
3902         rt->should_flush = 1;
3903         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3904                 iter->should_flush = 1;
3905 }
3906
3907 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
3908                                              const struct net_device *down_dev)
3909 {
3910         struct fib6_info *iter;
3911         unsigned int dead = 0;
3912
3913         if (rt->fib6_nh.nh_dev == down_dev ||
3914             rt->fib6_nh.nh_flags & RTNH_F_DEAD)
3915                 dead++;
3916         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3917                 if (iter->fib6_nh.nh_dev == down_dev ||
3918                     iter->fib6_nh.nh_flags & RTNH_F_DEAD)
3919                         dead++;
3920
3921         return dead;
3922 }
3923
3924 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
3925                                        const struct net_device *dev,
3926                                        unsigned int nh_flags)
3927 {
3928         struct fib6_info *iter;
3929
3930         if (rt->fib6_nh.nh_dev == dev)
3931                 rt->fib6_nh.nh_flags |= nh_flags;
3932         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3933                 if (iter->fib6_nh.nh_dev == dev)
3934                         iter->fib6_nh.nh_flags |= nh_flags;
3935 }
3936
/* called with write lock held for table with rt */
/* fib6_clean_all() callback for rt6_sync_down_dev().  A negative
 * return asks the walker to remove the route; NOTE(review): -1 vs -2
 * appear to select different removal handling in the fib6 walker —
 * confirm against fib6_clean_node().
 */
static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
{
	const struct arg_netdev_event *arg = p_arg;
	const struct net_device *dev = arg->dev;
	struct net *net = dev_net(dev);

	/* the null entry is never removed or flagged */
	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	switch (arg->event) {
	case NETDEV_UNREGISTER:
		/* device going away entirely: remove its routes */
		return rt->fib6_nh.nh_dev == dev ? -1 : 0;
	case NETDEV_DOWN:
		if (rt->should_flush)
			return -1;
		/* single-path route on this device: just remove it */
		if (!rt->fib6_nsiblings)
			return rt->fib6_nh.nh_dev == dev ? -1 : 0;
		if (rt6_multipath_uses_dev(rt, dev)) {
			unsigned int count;

			/* if every nexthop would be dead, flush the
			 * whole multipath group; otherwise just mark
			 * this device's nexthops and rebalance
			 */
			count = rt6_multipath_dead_count(rt, dev);
			if (rt->fib6_nsiblings + 1 == count) {
				rt6_multipath_flush(rt);
				return -1;
			}
			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
						   RTNH_F_LINKDOWN);
			fib6_update_sernum(net, rt);
			rt6_multipath_rebalance(rt);
		}
		return -2;
	case NETDEV_CHANGE:
		/* carrier change: mark link-down, but never on
		 * local/anycast routes
		 */
		if (rt->fib6_nh.nh_dev != dev ||
		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
			break;
		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
		rt6_multipath_rebalance(rt);
		break;
	}

	return 0;
}
3980
/* Apply fib6_ifdown() across all tables for a device event
 * (NETDEV_DOWN, NETDEV_UNREGISTER or NETDEV_CHANGE) on @dev.
 */
void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.event = event,
		},
	};

	fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
}
3992
/* Device is losing IPv6: drop/flag its routes, flush its uncached dst
 * entries, and tear down its IPv6 neighbour entries.
 */
void rt6_disable_ip(struct net_device *dev, unsigned long event)
{
	rt6_sync_down_dev(dev, event);
	rt6_uncached_list_flush_dev(dev_net(dev), dev);
	neigh_ifdown(&nd_tbl, dev);
}
3999
/* Argument for rt6_mtu_change_route(): the device whose MTU changed
 * and its new MTU value.
 */
struct rt6_mtu_change_arg {
	struct net_device *dev;
	unsigned int mtu;
};
4004
/* fib6_clean_all() callback for rt6_mtu_change(): propagate a device
 * MTU change into the route's RTAX_MTU metric and its cached exception
 * routes.  Always returns 0 so every route is visited.
 */
static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e. jumbo frame)
	 */
	if (rt->fib6_nh.nh_dev == arg->dev &&
	    !fib6_metric_locked(rt, RTAX_MTU)) {
		u32 mtu = rt->fib6_pmtu;

		/* only shrink to the new device MTU, or grow a PMTU
		 * that was previously capped by the old device MTU
		 */
		if (mtu >= arg->mtu ||
		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
			fib6_metric_set(rt, RTAX_MTU, arg->mtu);

		/* exception entries carry their own pmtu; update them
		 * under the exception lock
		 */
		spin_lock_bh(&rt6_exception_lock);
		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}
4039
/* Device MTU changed to @mtu: update metrics and cached PMTU of every
 * route over @dev in the device's namespace.
 */
void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
{
	struct rt6_mtu_change_arg arg = {
		.dev = dev,
		.mtu = mtu,
	};

	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
}
4049
/* Netlink attribute policy for RTM_{NEW,DEL,GET}ROUTE requests,
 * consumed by nlmsg_parse() in rtm_to_fib6_config().
 */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
	[RTA_OIF]               = { .type = NLA_U32 },
	[RTA_IIF]               = { .type = NLA_U32 },
	[RTA_PRIORITY]          = { .type = NLA_U32 },
	[RTA_METRICS]           = { .type = NLA_NESTED },
	[RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]              = { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
	[RTA_ENCAP]             = { .type = NLA_NESTED },
	[RTA_EXPIRES]           = { .type = NLA_U32 },
	[RTA_UID]               = { .type = NLA_U32 },
	[RTA_MARK]              = { .type = NLA_U32 },
};
4064
/* Translate an RTM_NEWROUTE/RTM_DELROUTE netlink message into a
 * fib6_config.  Returns 0 on success or a negative errno for a
 * malformed message (short address attributes, bad encap type, ...).
 */
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg,
			      struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  NULL);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	memset(cfg, 0, sizeof(*cfg));

	cfg->fc_table = rtm->rtm_table;
	cfg->fc_dst_len = rtm->rtm_dst_len;
	cfg->fc_src_len = rtm->rtm_src_len;
	cfg->fc_flags = RTF_UP;
	cfg->fc_protocol = rtm->rtm_protocol;
	cfg->fc_type = rtm->rtm_type;

	/* reject-style route types are represented by RTF_REJECT */
	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);

	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
	cfg->fc_nlinfo.nlh = nlh;
	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}

	if (tb[RTA_DST]) {
		/* only rtm_dst_len bits (rounded up to whole bytes)
		 * need to be present in the attribute
		 */
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	/* RTA_TABLE overrides the table id in the rtmsg header */
	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
						     cfg->fc_mp_len, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_PREF]) {
		/* unknown router preference values fall back to medium */
		pref = nla_get_u8(tb[RTA_PREF]);
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

	if (tb[RTA_ENCAP_TYPE]) {
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_EXPIRES]) {
		/* clamp user-supplied seconds to the valid jiffies range */
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
		}
	}

	err = 0;
errout:
	return err;
}
4190
/* Pending nexthop for a multipath add: the allocated route, the config
 * it was built from, and its link in the rt6_nh_list work list.
 */
struct rt6_nh {
	struct fib6_info *fib6_info;
	struct fib6_config r_cfg;
	struct list_head next;
};
4196
4197 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4198 {
4199         struct rt6_nh *nh;
4200
4201         list_for_each_entry(nh, rt6_nh_list, next) {
4202                 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4203                         &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4204                         nh->r_cfg.fc_ifindex);
4205         }
4206 }
4207
/* Append one parsed nexthop (@rt, configured by @r_cfg) to @rt6_nh_list.
 *
 * Returns -EEXIST if an equivalent nexthop is already queued, -ENOMEM on
 * allocation failure, a negative error from ip6_convert_metrics(), or 0 on
 * success.  On success the list entry takes over the caller's reference on
 * @rt (the caller's cleanup path releases nh->fib6_info).
 */
static int ip6_route_info_append(struct net *net,
				 struct list_head *rt6_nh_list,
				 struct fib6_info *rt,
				 struct fib6_config *r_cfg)
{
	struct rt6_nh *nh;
	int err = -EEXIST;

	list_for_each_entry(nh, rt6_nh_list, next) {
		/* check if fib6_info already exists */
		if (rt6_duplicate_nexthop(nh->fib6_info, rt))
			return err;
	}

	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
	if (!nh)
		return -ENOMEM;
	nh->fib6_info = rt;
	err = ip6_convert_metrics(net, rt, r_cfg);
	if (err) {
		kfree(nh);
		return err;
	}
	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
	list_add_tail(&nh->next, rt6_nh_list);

	return 0;
}
4236
4237 static void ip6_route_mpath_notify(struct fib6_info *rt,
4238                                    struct fib6_info *rt_last,
4239                                    struct nl_info *info,
4240                                    __u16 nlflags)
4241 {
4242         /* if this is an APPEND route, then rt points to the first route
4243          * inserted and rt_last points to last route inserted. Userspace
4244          * wants a consistent dump of the route which starts at the first
4245          * nexthop. Since sibling routes are always added at the end of
4246          * the list, find the first sibling of the last route appended
4247          */
4248         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4249                 rt = list_first_entry(&rt_last->fib6_siblings,
4250                                       struct fib6_info,
4251                                       fib6_siblings);
4252         }
4253
4254         if (rt)
4255                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4256 }
4257
/* Handle RTM_NEWROUTE carrying an RTA_MULTIPATH attribute: build one
 * fib6_info per nexthop, insert them all, and send a single netlink
 * notification covering the whole route.  If any insertion fails, the
 * nexthops inserted so far are deleted again so the result is
 * all-or-nothing (for replace, a warning lists possibly-stale entries).
 */
static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct fib6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	__u16 nlflags;
	int remaining;
	int attrlen;
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);

	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * fib6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		/* each nexthop starts from a copy of the route-level config,
		 * then overrides ifindex/gateway/encap from its own attrs
		 */
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}

		/* rtnh_hops is a zero-based weight on the wire */
		rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;

		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
					    rt, &r_cfg);
		if (err) {
			fib6_info_release(rt);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* for add and replace send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		rt_last = nh->fib6_info;
		err = __ip6_ins_rt(nh->fib6_info, info, extack);
		fib6_info_release(nh->fib6_info);

		/* save reference to first route for notification */
		if (!rt_notif && !err)
			rt_notif = nh->fib6_info;

		/* nh->fib6_info is used or freed at this point, reset to NULL*/
		nh->fib6_info = NULL;
		if (err) {
			if (replace && nhn)
				ip6_print_replace_route_err(&rt6_nh_list);
			err_nh = nh;
			goto add_errout;
		}

		/* Because each route is added like a single route we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		nhn++;
	}

	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	goto cleanup;

add_errout:
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg, extack);
	}

cleanup:
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->fib6_info)
			fib6_info_release(nh->fib6_info);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}
4393
4394 static int ip6_route_multipath_del(struct fib6_config *cfg,
4395                                    struct netlink_ext_ack *extack)
4396 {
4397         struct fib6_config r_cfg;
4398         struct rtnexthop *rtnh;
4399         int remaining;
4400         int attrlen;
4401         int err = 1, last_err = 0;
4402
4403         remaining = cfg->fc_mp_len;
4404         rtnh = (struct rtnexthop *)cfg->fc_mp;
4405
4406         /* Parse a Multipath Entry */
4407         while (rtnh_ok(rtnh, remaining)) {
4408                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4409                 if (rtnh->rtnh_ifindex)
4410                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4411
4412                 attrlen = rtnh_attrlen(rtnh);
4413                 if (attrlen > 0) {
4414                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4415
4416                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4417                         if (nla) {
4418                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4419                                 r_cfg.fc_flags |= RTF_GATEWAY;
4420                         }
4421                 }
4422                 err = ip6_route_del(&r_cfg, extack);
4423                 if (err)
4424                         last_err = err;
4425
4426                 rtnh = rtnh_next(rtnh, &remaining);
4427         }
4428
4429         return last_err;
4430 }
4431
4432 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4433                               struct netlink_ext_ack *extack)
4434 {
4435         struct fib6_config cfg;
4436         int err;
4437
4438         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4439         if (err < 0)
4440                 return err;
4441
4442         if (cfg.fc_mp)
4443                 return ip6_route_multipath_del(&cfg, extack);
4444         else {
4445                 cfg.fc_delete_all_nh = 1;
4446                 return ip6_route_del(&cfg, extack);
4447         }
4448 }
4449
4450 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4451                               struct netlink_ext_ack *extack)
4452 {
4453         struct fib6_config cfg;
4454         int err;
4455
4456         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4457         if (err < 0)
4458                 return err;
4459
4460         if (cfg.fc_mp)
4461                 return ip6_route_multipath_add(&cfg, extack);
4462         else
4463                 return ip6_route_add(&cfg, GFP_KERNEL, extack);
4464 }
4465
/* Worst-case netlink message size needed by rt6_fill_node() for @rt,
 * including one RTA_MULTIPATH nexthop entry per sibling route.  Must stay
 * in sync with what rt6_fill_node()/rt6_add_nexthop() actually emit
 * (an undersized estimate triggers the -EMSGSIZE WARN in inet6_rt_notify).
 */
static size_t rt6_nlmsg_size(struct fib6_info *rt)
{
	int nexthop_len = 0;

	if (rt->fib6_nsiblings) {
		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
			    + NLA_ALIGN(sizeof(struct rtnexthop))
			    + nla_total_size(16) /* RTA_GATEWAY */
			    + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);

		nexthop_len *= rt->fib6_nsiblings;
	}

	return NLMSG_ALIGN(sizeof(struct rtmsg))
	       + nla_total_size(16) /* RTA_SRC */
	       + nla_total_size(16) /* RTA_DST */
	       + nla_total_size(16) /* RTA_GATEWAY */
	       + nla_total_size(16) /* RTA_PREFSRC */
	       + nla_total_size(4) /* RTA_TABLE */
	       + nla_total_size(4) /* RTA_IIF */
	       + nla_total_size(4) /* RTA_OIF */
	       + nla_total_size(4) /* RTA_PRIORITY */
	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
	       + nla_total_size(sizeof(struct rta_cacheinfo))
	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
	       + nla_total_size(1) /* RTA_PREF */
	       + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
	       + nexthop_len;
}
4495
/* Emit the nexthop attributes of @rt (RTA_GATEWAY, RTA_OIF, lwtunnel
 * encap) into @skb and accumulate RTNH_F_* state into *@flags.
 *
 * @skip_oif: omit RTA_OIF; used for RTA_MULTIPATH entries where the
 *	ifindex is already carried in the rtnexthop header.
 *
 * Returns 0 on success, -EMSGSIZE if the skb ran out of room.
 */
static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
			    unsigned int *flags, bool skip_oif)
{
	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		*flags |= RTNH_F_DEAD;

	if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
		*flags |= RTNH_F_LINKDOWN;

		/* also report the nexthop as dead when this table ignores
		 * linkdown nexthops
		 */
		rcu_read_lock();
		if (fib6_ignore_linkdown(rt))
			*flags |= RTNH_F_DEAD;
		rcu_read_unlock();
	}

	if (rt->fib6_flags & RTF_GATEWAY) {
		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
			goto nla_put_failure;
	}

	*flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
	if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
		*flags |= RTNH_F_OFFLOAD;

	/* not needed for multipath encoding b/c it has a rtnexthop struct */
	if (!skip_oif && rt->fib6_nh.nh_dev &&
	    nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
		goto nla_put_failure;

	if (rt->fib6_nh.nh_lwtstate &&
	    lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
4534
/* add multipath next hop: emit one rtnexthop header plus its attributes
 * inside an open RTA_MULTIPATH nest.  Returns 0 or -EMSGSIZE.
 */
static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
{
	const struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rtnexthop *rtnh;
	unsigned int flags = 0;

	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
	if (!rtnh)
		goto nla_put_failure;

	/* nh_weight is one-based internally, zero-based on the wire */
	rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
	rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;

	/* skip_oif=true: ifindex already sits in the rtnexthop header */
	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
		goto nla_put_failure;

	rtnh->rtnh_flags = flags;

	/* length of rtnetlink header + attributes */
	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
4562
/* Build one RTM_* route message for @rt into @skb.
 *
 * @dst:  optional resolved dst (getroute replies); when set, metrics and
 *	  cacheinfo come from it rather than from the FIB entry.
 * @dest: concrete destination of a lookup; forces a /128 RTA_DST.
 * @src:  concrete source for subtree routes (CONFIG_IPV6_SUBTREES only).
 * @iif:  incoming ifindex for input-route replies; 0 for dumps/notifies.
 *
 * Returns 0 on success or -EMSGSIZE; on failure the partially built
 * message is cancelled.
 */
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	struct rtmsg *rtm;
	struct nlmsghdr *nlh;
	long expires = 0;
	u32 *pmetrics;
	u32 table;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt->fib6_dst.plen;
	rtm->rtm_src_len = rt->fib6_src.plen;
	rtm->rtm_tos = 0;
	if (rt->fib6_table)
		table = rt->fib6_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	rtm->rtm_table = table;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;

	rtm->rtm_type = rt->fib6_type;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->fib6_protocol;

	if (rt->fib6_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	/* a concrete lookup destination is reported as a host route */
	if (dest) {
		if (nla_put_in6_addr(skb, RTA_DST, dest))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt->fib6_dst.addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt->fib6_src.addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		/* err == 0: ip6mr filled the whole reply; err < 0: fail;
		 * err > 0: continue filling the attributes below
		 */
		if (ipv6_addr_is_multicast(&rt->fib6_dst.addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dest) {
		/* output lookup: report the source address that would be
		 * selected for this destination
		 */
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->fib6_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->fib6_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (rt6_add_nexthop(skb, rt) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings, fib6_siblings) {
			if (rt6_add_nexthop(skb, sibling) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
			goto nla_put_failure;
	}

	if (rt->fib6_flags & RTF_EXPIRES) {
		expires = dst ? dst->expires : rt->expires;
		expires -= jiffies;
	}

	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->fib6_flags)))
		goto nla_put_failure;


	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
4695
/* fib6 tree-walk callback for route dumps: emit one RTM_NEWROUTE message
 * for @rt, honouring the RTM_F_PREFIX filter from the request.  Returning
 * a positive value skips the entry without aborting the walk; the null
 * entry is silently skipped.
 */
int rt6_dump_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
	struct net *net = arg->net;

	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);

		/* user wants prefix routes only */
		if (rtm->rtm_flags & RTM_F_PREFIX &&
		    !(rt->fib6_flags & RTF_PREFIX_RT)) {
			/* success since this is not a prefix route */
			return 1;
		}
	}

	return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
			     RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
			     arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
}
4719
4720 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4721                               struct netlink_ext_ack *extack)
4722 {
4723         struct net *net = sock_net(in_skb->sk);
4724         struct nlattr *tb[RTA_MAX+1];
4725         int err, iif = 0, oif = 0;
4726         struct fib6_info *from;
4727         struct dst_entry *dst;
4728         struct rt6_info *rt;
4729         struct sk_buff *skb;
4730         struct rtmsg *rtm;
4731         struct flowi6 fl6;
4732         bool fibmatch;
4733
4734         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4735                           extack);
4736         if (err < 0)
4737                 goto errout;
4738
4739         err = -EINVAL;
4740         memset(&fl6, 0, sizeof(fl6));
4741         rtm = nlmsg_data(nlh);
4742         fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4743         fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4744
4745         if (tb[RTA_SRC]) {
4746                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4747                         goto errout;
4748
4749                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4750         }
4751
4752         if (tb[RTA_DST]) {
4753                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4754                         goto errout;
4755
4756                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4757         }
4758
4759         if (tb[RTA_IIF])
4760                 iif = nla_get_u32(tb[RTA_IIF]);
4761
4762         if (tb[RTA_OIF])
4763                 oif = nla_get_u32(tb[RTA_OIF]);
4764
4765         if (tb[RTA_MARK])
4766                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4767
4768         if (tb[RTA_UID])
4769                 fl6.flowi6_uid = make_kuid(current_user_ns(),
4770                                            nla_get_u32(tb[RTA_UID]));
4771         else
4772                 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4773
4774         if (iif) {
4775                 struct net_device *dev;
4776                 int flags = 0;
4777
4778                 rcu_read_lock();
4779
4780                 dev = dev_get_by_index_rcu(net, iif);
4781                 if (!dev) {
4782                         rcu_read_unlock();
4783                         err = -ENODEV;
4784                         goto errout;
4785                 }
4786
4787                 fl6.flowi6_iif = iif;
4788
4789                 if (!ipv6_addr_any(&fl6.saddr))
4790                         flags |= RT6_LOOKUP_F_HAS_SADDR;
4791
4792                 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
4793
4794                 rcu_read_unlock();
4795         } else {
4796                 fl6.flowi6_oif = oif;
4797
4798                 dst = ip6_route_output(net, NULL, &fl6);
4799         }
4800
4801
4802         rt = container_of(dst, struct rt6_info, dst);
4803         if (rt->dst.error) {
4804                 err = rt->dst.error;
4805                 ip6_rt_put(rt);
4806                 goto errout;
4807         }
4808
4809         if (rt == net->ipv6.ip6_null_entry) {
4810                 err = rt->dst.error;
4811                 ip6_rt_put(rt);
4812                 goto errout;
4813         }
4814
4815         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4816         if (!skb) {
4817                 ip6_rt_put(rt);
4818                 err = -ENOBUFS;
4819                 goto errout;
4820         }
4821
4822         skb_dst_set(skb, &rt->dst);
4823
4824         rcu_read_lock();
4825         from = rcu_dereference(rt->from);
4826
4827         if (fibmatch)
4828                 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
4829                                     RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4830                                     nlh->nlmsg_seq, 0);
4831         else
4832                 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
4833                                     &fl6.saddr, iif, RTM_NEWROUTE,
4834                                     NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
4835                                     0);
4836         rcu_read_unlock();
4837
4838         if (err < 0) {
4839                 kfree_skb(skb);
4840                 goto errout;
4841         }
4842
4843         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4844 errout:
4845         return err;
4846 }
4847
/* Send an rtnetlink notification for @event (e.g. RTM_NEWROUTE /
 * RTM_DELROUTE) about @rt to RTNLGRP_IPV6_ROUTE listeners.  On failure
 * the error is recorded on the multicast group so listeners see it.
 */
void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
		     unsigned int nlm_flags)
{
	struct sk_buff *skb;
	struct net *net = info->nl_net;
	u32 seq;
	int err;

	err = -ENOBUFS;
	seq = info->nlh ? info->nlh->nlmsg_seq : 0;

	/* gfp_any() picks an allocation mode suitable for the current
	 * (process or atomic) context
	 */
	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
	if (!skb)
		goto errout;

	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
			    event, info->portid, seq, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}
4878
/* netdevice notifier: attach the per-netns special routes (null entry
 * and, with CONFIG_IPV6_MULTIPLE_TABLES, prohibit/blackhole entries) to
 * the namespace's loopback device on NETDEV_REGISTER, and drop their
 * idev references on NETDEV_UNREGISTER.
 */
static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	/* only the loopback device hosts the special routes */
	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	 } else if (event == NETDEV_UNREGISTER &&
		    dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER could be fired for multiple times by
		 * netdev_wait_allrefs(). Make sure we only call this once.
		 */
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}
4912
4913 /*
4914  *      /proc
4915  */
4916
4917 #ifdef CONFIG_PROC_FS
4918
/* seq_file ops for the ipv6 route listing in /proc; ipv6_route_open is
 * defined elsewhere in this file (presumably backs /proc/net/ipv6_route
 * — confirm against the registration site).
 */
static const struct file_operations ipv6_route_proc_fops = {
	.open		= ipv6_route_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};
4925
/* Render the rt6 statistics proc file: seven hex fields covering fib
 * node/route counts, allocated/cached entries, dst entries in use and
 * discarded routes for this namespace.
 */
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;
	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}
4940
/* open() for the per-netns rt6 statistics proc file */
static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
	return single_open_net(inode, file, rt6_stats_seq_show);
}
4945
/* file_operations for the rt6 statistics proc file (single_open style) */
static const struct file_operations rt6_stats_seq_fops = {
	.open	 = rt6_stats_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release_net,
};
4952 #endif  /* CONFIG_PROC_FS */
4953
4954 #ifdef CONFIG_SYSCTL
4955
4956 static
4957 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4958                               void __user *buffer, size_t *lenp, loff_t *ppos)
4959 {
4960         struct net *net;
4961         int delay;
4962         if (!write)
4963                 return -EINVAL;
4964
4965         net = (struct net *)ctl->extra1;
4966         delay = net->ipv6.sysctl.flush_delay;
4967         proc_dointvec(ctl, write, buffer, lenp, ppos);
4968         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
4969         return 0;
4970 }
4971
/* Template for the per-namespace net.ipv6.route.* sysctl table.  The
 * .data pointers below reference init_net; ipv6_route_sysctl_init()
 * re-points them at each namespace's own fields by array index, so the
 * entry order here must stay in sync with that function.
 */
struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,	/* write-only trigger */
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		/* same backing field as gc_min_interval, in milliseconds */
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{ }	/* sentinel */
};
5045
/* Build the per-network-namespace copy of the IPv6 route sysctl table.
 *
 * Duplicates ipv6_route_table_template and repoints each entry's ->data
 * at the matching field of @net so every namespace gets independent
 * tunables.  Returns the new table, or NULL if kmemdup() fails (callers
 * must handle NULL).
 *
 * NOTE(review): the table[N].data assignments below are positional and
 * must stay in sync with the entry order in ipv6_route_table_template.
 */
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;

		/* Don't export sysctls to unprivileged users.
		 * An empty procname acts as a table terminator (cf. the
		 * trailing { } sentinel), so NULLing entry 0 hides the
		 * whole table from non-init user namespaces.
		 */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
5074 #endif
5075
/* Set up the per-netns state of the IPv6 routing engine: dst_ops and dst
 * entry accounting, the always-present template route entries, and the
 * default sysctl values.  Returns 0 on success or -ENOMEM; on failure the
 * goto chain below unwinds every allocation made so far, in reverse order.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	/* Each template entry is duplicated per namespace; dst-bearing
	 * copies then get their dst.ops and metrics pointed at this
	 * namespace's own structures.
	 */
	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	/* Default tunables; overridable per-netns via the sysctl table
	 * wired up in ipv6_route_sysctl_init().
	 */
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

	/* Error unwind: labels free strictly what was allocated before the
	 * failing step, newest first.
	 */
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
5150
5151 static void __net_exit ip6_route_net_exit(struct net *net)
5152 {
5153         kfree(net->ipv6.fib6_null_entry);
5154         kfree(net->ipv6.ip6_null_entry);
5155 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5156         kfree(net->ipv6.ip6_prohibit_entry);
5157         kfree(net->ipv6.ip6_blk_hole_entry);
5158 #endif
5159         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5160 }
5161
5162 static int __net_init ip6_route_net_init_late(struct net *net)
5163 {
5164 #ifdef CONFIG_PROC_FS
5165         proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
5166         proc_create("rt6_stats", 0444, net->proc_net, &rt6_stats_seq_fops);
5167 #endif
5168         return 0;
5169 }
5170
5171 static void __net_exit ip6_route_net_exit_late(struct net *net)
5172 {
5173 #ifdef CONFIG_PROC_FS
5174         remove_proc_entry("ipv6_route", net->proc_net);
5175         remove_proc_entry("rt6_stats", net->proc_net);
5176 #endif
5177 }
5178
/* Per-netns setup/teardown of the core IPv6 routing state. */
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
5183
5184 static int __net_init ipv6_inetpeer_init(struct net *net)
5185 {
5186         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5187
5188         if (!bp)
5189                 return -ENOMEM;
5190         inet_peer_base_init(bp);
5191         net->ipv6.peers = bp;
5192         return 0;
5193 }
5194
5195 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5196 {
5197         struct inet_peer_base *bp = net->ipv6.peers;
5198
5199         net->ipv6.peers = NULL;
5200         inetpeer_invalidate_tree(bp);
5201         kfree(bp);
5202 }
5203
/* Per-netns setup/teardown of the IPv6 inet_peer base. */
static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};
5208
/* Proc-file setup/teardown; registered separately, after fib6_rules_init()
 * succeeds in ip6_route_init() below.
 */
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};
5213
/* Netdevice event handler for the routing code.  Priority is set just
 * below addrconf's (notifiers run in descending priority order), so
 * addrconf processes a device event before this handler does.
 */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
5218
/* Point init_net's template route entries at its loopback device.
 * Called during boot, once the loopback device exists.
 */
void __init ip6_route_init_special_entries(void)
{
	/* Registering of the loopback is done before this portion of code,
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for init_net */
	init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #endif
}
5234
/* Boot-time initialization of the IPv6 routing subsystem.
 *
 * Registers, in order: the rt6_info slab cache, blackhole dst accounting,
 * the inetpeer and core-route pernet ops, the FIB, xfrm6, fib6 rules, the
 * late (proc) pernet ops, the RTM_{NEW,DEL,GET}ROUTE rtnetlink handlers,
 * and the netdevice notifier.  On any failure the goto chain unwinds
 * everything registered so far, in reverse order.  Returns 0 or a
 * negative errno.
 */
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	/* The blackhole ops share the rt6_info slab cache. */
	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	/* The three rtnetlink registrations share one unwind label: on any
	 * failure rtnl_unregister_all(PF_INET6) drops whatever subset of
	 * them succeeded.
	 */
	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	/* NOTE(review): the per-cpu uncached lists are initialized last,
	 * after the notifier and rtnetlink handlers are live — confirm no
	 * path can touch rt6_uncached_list before this loop runs.
	 */
	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

	/* Error unwind, reverse order of the registrations above. */
out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
5326
/* Module-exit teardown: undo ip6_route_init() in reverse registration
 * order.  Keep this list the mirror image of the init sequence above.
 */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}