/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable.  otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */
#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/rtnh.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <net/ip.h>
#include <linux/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
static int ip6_rt_type_to_error(u8 fib6_type);

#define CREATE_TRACE_POINTS
#include <trace/events/fib6.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
#undef CREATE_TRACE_POINTS
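/* Neighbour-unreachability scores used by rt6_check_neigh() and
 * rt6_score_route() below: negative values reject a nexthop with
 * increasing severity, and RT6_NUD_FAIL_DO_RR asks the caller to fall
 * back to round-robin among routers of equal metric.
 */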
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ip6_default_advmss(const struct dst_entry *dst);
static unsigned int ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void ip6_dst_destroy(struct dst_entry *);
static void ip6_dst_ifdown(struct dst_entry *,
			   struct net_device *dev, int how);
static int ip6_dst_gc(struct dst_ops *ops);

static int ip6_pkt_discard(struct sk_buff *skb);
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void ip6_link_failure(struct sk_buff *skb);
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu);
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
			    struct sk_buff *skb);
static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
			   int strict);
static size_t rt6_nlmsg_size(struct fib6_info *rt);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr);
#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_add_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev,
					    unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev);
#endif
struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
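/* Per-cpu list of cached routes that are *not* owned by the fib6 tree
 * (see the FLOWI_FLAG_KNOWN_NH case in ip6_pol_route()).  Tracking them
 * here lets rt6_uncached_list_flush_dev() re-point their device and
 * inet6_dev references at loopback when the device goes away.
 */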
void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}
void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
static inline const void *choose_neigh_daddr(const struct in6_addr *p,
					     struct sk_buff *skb,
					     const void *daddr)
{
	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}
struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
{
	struct neighbour *n;

	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	if (n)
		return n;

	n = neigh_create(&nd_tbl, daddr, dev);
	return IS_ERR(n) ? NULL : n;
}

static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
}
static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	dst_cow_metrics_generic,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};
static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			= AF_INET6,
	.destroy		= ip6_dst_destroy,
	.check			= ip6_dst_check,
	.mtu			= ip6_blackhole_mtu,
	.default_advmss		= ip6_default_advmss,
	.update_pmtu		= ip6_rt_blackhole_update_pmtu,
	.redirect		= ip6_rt_blackhole_redirect,
	.cow_metrics		= dst_cow_metrics_generic,
	.neigh_lookup		= ip6_dst_neigh_lookup,
};
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol	= RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= REFCOUNT_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}

/* allocate dst with ip6_dst_ops */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	ip_dst_metrics_put(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	from = xchg((__force struct fib6_info **)&rt->from, NULL);
	fib6_info_release(from);
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);

		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}
static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}
void fib6_select_path(const struct net *net, struct fib6_result *res,
		      struct flowi6 *fl6, int oif, bool have_oif_match,
		      const struct sk_buff *skb, int strict)
{
	struct fib6_info *sibling, *next_sibling;
	struct fib6_info *match = res->f6i;

	if (!match->fib6_nsiblings || have_oif_match)
		goto out;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.fib_nh_upper_bound))
		goto out;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		const struct fib6_nh *nh = &sibling->fib6_nh;
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

out:
	res->f6i = match;
	res->nh = &match->fib6_nh;
}
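/* The selection above is hash-threshold multipath (as in RFC 2992):
 * fl6->mp_hash is compared against each sibling nexthop's upper bound,
 * derived from the nexthop weights, so a given flow keeps mapping to
 * the same nexthop for as long as the sibling set is unchanged.
 */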
/*
 *	Route lookup. rcu_read_lock() should be held.
 */

static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh,
			       const struct in6_addr *saddr, int oif, int flags)
{
	const struct net_device *dev;

	if (nh->fib_nh_flags & RTNH_F_DEAD)
		return false;

	dev = nh->fib_nh_dev;
	if (oif) {
		if (dev->ifindex == oif)
			return true;
	} else {
		if (ipv6_chk_addr(net, saddr, dev,
				  flags & RT6_LOOKUP_F_IFACE))
			return true;
	}

	return false;
}
static void rt6_device_match(struct net *net, struct fib6_result *res,
			     const struct in6_addr *saddr, int oif, int flags)
{
	struct fib6_info *f6i = res->f6i;
	struct fib6_info *spf6i;
	struct fib6_nh *nh;

	if (!oif && ipv6_addr_any(saddr)) {
		nh = &f6i->fib6_nh;
		if (!(nh->fib_nh_flags & RTNH_F_DEAD))
			goto out;
	}

	for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) {
		nh = &spf6i->fib6_nh;
		if (__rt6_device_match(net, nh, saddr, oif, flags)) {
			res->f6i = spf6i;
			goto out;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE) {
		res->f6i = net->ipv6.fib6_null_entry;
		nh = &res->f6i->fib6_nh;
		goto out;
	}

	nh = &f6i->fib6_nh;
	if (nh->fib_nh_flags & RTNH_F_DEAD) {
		res->f6i = net->ipv6.fib6_null_entry;
		nh = &res->f6i->fib6_nh;
	}
out:
	res->nh = nh;
	res->fib6_type = res->f6i->fib6_type;
	res->fib6_flags = res->f6i->fib6_flags;
}
#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}
static void rt6_probe(struct fib6_nh *fib6_nh)
{
	struct __rt6_probe_work *work = NULL;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;
	struct inet6_dev *idev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!fib6_nh->fib_nh_gw_family)
		return;

	nh_gw = &fib6_nh->fib_nh_gw6;
	dev = fib6_nh->fib_nh_dev;
	rcu_read_lock_bh();
	idev = __in6_dev_get(dev);
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else if (time_after(jiffies, fib6_nh->last_probe +
				       idev->cnf.rtr_probe_interval)) {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		fib6_nh->last_probe = jiffies;
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct fib6_nh *fib6_nh)
{
}
#endif

/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev,
					  &fib6_nh->fib_nh_gw6);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
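/* Score a nexthop for router selection: an outgoing-interface match is
 * worth 2, RFC 4191 router-preference bits are folded in above bit 2,
 * and a negative RT6_NUD_* value is returned when the neighbour state
 * rules the router out (or asks the caller to round-robin instead).
 */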
static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
			   int strict)
{
	int m = 0;

	if (!oif || nh->fib_nh_dev->ifindex == oif)
		m = 2;

	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2;
#endif
	if ((strict & RT6_LOOKUP_F_REACHABLE) &&
	    !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) {
		int n = rt6_check_neigh(nh);

		if (n < 0)
			return n;
	}
	return m;
}
static bool find_match(struct fib6_nh *nh, u32 fib6_flags,
		       int oif, int strict, int *mpri, bool *do_rr)
{
	bool match_do_rr = false;
	bool rc = false;
	int m;

	if (nh->fib_nh_flags & RTNH_F_DEAD)
		goto out;

	if (ip6_ignore_linkdown(nh->fib_nh_dev) &&
	    nh->fib_nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	m = rt6_score_route(nh, fib6_flags, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(nh);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		rc = true;
	}
out:
	return rc;
}
static void __find_rr_leaf(struct fib6_info *f6i_start,
			   struct fib6_info *nomatch, u32 metric,
			   struct fib6_result *res, struct fib6_info **cont,
			   int oif, int strict, bool *do_rr, int *mpri)
{
	struct fib6_info *f6i;

	for (f6i = f6i_start;
	     f6i && f6i != nomatch;
	     f6i = rcu_dereference(f6i->fib6_next)) {
		struct fib6_nh *nh;

		if (cont && f6i->fib6_metric != metric) {
			*cont = f6i;
			return;
		}

		if (fib6_check_expired(f6i))
			continue;

		nh = &f6i->fib6_nh;
		if (find_match(nh, f6i->fib6_flags, oif, strict, mpri, do_rr)) {
			res->f6i = f6i;
			res->nh = nh;
			res->fib6_flags = f6i->fib6_flags;
			res->fib6_type = f6i->fib6_type;
		}
	}
}
static void find_rr_leaf(struct fib6_node *fn, struct fib6_info *leaf,
			 struct fib6_info *rr_head, int oif, int strict,
			 bool *do_rr, struct fib6_result *res)
{
	u32 metric = rr_head->fib6_metric;
	struct fib6_info *cont = NULL;
	int mpri = -1;

	__find_rr_leaf(rr_head, NULL, metric, res, &cont,
		       oif, strict, do_rr, &mpri);

	__find_rr_leaf(leaf, rr_head, metric, res, &cont,
		       oif, strict, do_rr, &mpri);

	if (res->f6i || !cont)
		return;

	__find_rr_leaf(cont, NULL, metric, res, NULL,
		       oif, strict, do_rr, &mpri);
}
static void rt6_select(struct net *net, struct fib6_node *fn, int oif,
		       struct fib6_result *res, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *rt0;
	bool do_rr = false;
	int key_plen;

	/* make sure this function or its helpers sets f6i */
	res->f6i = NULL;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		goto out;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		goto out;

	find_rr_leaf(fn, leaf, rt0, oif, strict, &do_rr, res);
	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

out:
	if (!res->f6i) {
		res->f6i = net->ipv6.fib6_null_entry;
		res->nh = &res->f6i->fib6_nh;
		res->fib6_flags = res->f6i->fib6_flags;
		res->fib6_type = res->f6i->fib6_type;
	}
}
static bool rt6_is_gw_or_nonexthop(const struct fib6_result *res)
{
	return (res->f6i->fib6_flags & RTF_NONEXTHOP) ||
	       res->nh->fib_nh_gw_family;
}
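/* rt6_route_rcv() below consumes the Route Information option (RFC
 * 4191) carried in Router Advertisements: it validates the encoded
 * prefix length against the option length, then adds, refreshes, or
 * (on a zero lifetime) removes the corresponding RTF_ROUTEINFO route.
 */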
#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
#endif
/*
 *	Misc support functions
 */

/* called with rcu_lock held */
static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res)
{
	struct net_device *dev = res->nh->fib_nh_dev;

	if (res->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&res->f6i->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}
static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};

static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}
static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
{
	unsigned short flags = 0;

	if (rt->dst_nocount)
		flags |= DST_NOCOUNT;
	if (rt->dst_nopolicy)
		flags |= DST_NOPOLICY;
	if (rt->dst_host)
		flags |= DST_HOST;

	return flags;
}
static void ip6_rt_init_dst_reject(struct rt6_info *rt, u8 fib6_type)
{
	rt->dst.error = ip6_rt_type_to_error(fib6_type);

	switch (fib6_type) {
	case RTN_BLACKHOLE:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}
static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res)
{
	struct fib6_info *f6i = res->f6i;

	if (res->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, res->fib6_type);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	if (res->fib6_type == RTN_LOCAL || res->fib6_type == RTN_ANYCAST) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&f6i->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (res->nh->fib_nh_lws) {
		rt->dst.lwtstate = lwtstate_get(res->nh->fib_nh_lws);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}
/* Caller must already hold reference to @from */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	rcu_assign_pointer(rt->from, from);
	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
}

/* Caller must already hold reference to f6i in result */
static void ip6_rt_copy_init(struct rt6_info *rt, const struct fib6_result *res)
{
	const struct fib6_nh *nh = res->nh;
	const struct net_device *dev = nh->fib_nh_dev;
	struct fib6_info *f6i = res->f6i;

	ip6_rt_init_dst(rt, res);

	rt->rt6i_dst = f6i->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_flags = res->fib6_flags;
	if (nh->fib_nh_gw_family) {
		rt->rt6i_gateway = nh->fib_nh_gw6;
		rt->rt6i_flags |= RTF_GATEWAY;
	}
	rt6_set_from(rt, f6i);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = f6i->fib6_src;
#endif
}
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}

static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (net) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}
/* called with rcu_lock held */
static struct rt6_info *ip6_create_rt_rcu(const struct fib6_result *res)
{
	struct net_device *dev = res->nh->fib_nh_dev;
	struct fib6_info *f6i = res->f6i;
	unsigned short flags;
	struct rt6_info *nrt;

	if (!fib6_info_hold_safe(f6i))
		goto fallback;

	flags = fib6_info_dst_flags(f6i);
	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (!nrt) {
		fib6_info_release(f6i);
		goto fallback;
	}

	ip6_rt_copy_init(nrt, res);
	return nrt;

fallback:
	nrt = dev_net(dev)->ipv6.ip6_null_entry;
	dst_hold(&nrt->dst);
	return nrt;
}
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_result res = {};
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	res.f6i = rcu_dereference(fn->leaf);
	if (!res.f6i)
		res.f6i = net->ipv6.fib6_null_entry;
	else
		rt6_device_match(net, &res, &fl6->saddr, fl6->flowi6_oif,
				 flags);

	if (res.f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;

		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
		goto out;
	}

	fib6_select_path(net, &res, fl6, fl6->flowi6_oif,
			 fl6->flowi6_oif != 0, skb, flags);

	/* Search through exception table */
	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);
	} else {
		rt = ip6_create_rt_rcu(&res);
	}

out:
	rcu_read_unlock();

	trace_fib6_table_lookup(net, &res, table, fl6);

	return rt;
}
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);

struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);
/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason, the
 * route is released.
 * Caller must hold dst before calling it.
 */
static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}
static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct fib6_info *f6i = res->f6i;
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (!fib6_info_hold_safe(f6i))
		return NULL;

	dev = ip6_rt_get_dev_rcu(res);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt) {
		fib6_info_release(f6i);
		return NULL;
	}

	ip6_rt_copy_init(rt, res);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(res)) {
		if (f6i->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&f6i->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res)
{
	struct fib6_info *f6i = res->f6i;
	unsigned short flags = fib6_info_dst_flags(f6i);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	if (!fib6_info_hold_safe(f6i))
		return NULL;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(res);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt) {
		fib6_info_release(f6i);
		return NULL;
	}
	ip6_rt_copy_init(pcpu_rt, res);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(res->f6i->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt);

	return pcpu_rt;
}
static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    const struct fib6_result *res)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(res);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(res->f6i->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	if (res->f6i->fib6_destroying) {
		struct fib6_info *from;

		from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL);
		fib6_info_release(from);
	}

	return pcpu_rt;
}
/* exception hash table implementation
 */
static DEFINE_SPINLOCK(rt6_exception_lock);
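/* Each fib6_info can carry a hash table of "exception" routes: RTF_CACHE
 * clones created for path-MTU updates and ICMPv6 redirects.  Writers
 * serialize on rt6_exception_lock; readers walk the buckets under RCU.
 */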
/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct fib6_info *from;
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	net->ipv6.rt6_stats->fib_rt_cache--;

	/* purge the exception completely to allow releasing the held
	 * resources: some [sk] cache may keep the dst around for an
	 * unlimited time
	 */
	from = xchg((__force struct fib6_info **)&rt6_ex->rt6i->from, NULL);
	fib6_info_release(from);
	dst_dev_put(&rt6_ex->rt6i->dst);

	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
}
/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}

static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
static unsigned int fib6_mtu(const struct fib6_result *res)
{
	const struct fib6_nh *nh = res->nh;
	unsigned int mtu;

	if (res->f6i->fib6_pmtu) {
		mtu = res->f6i->fib6_pmtu;
	} else {
		struct net_device *dev = nh->fib_nh_dev;
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
}
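/* Example: with no PMTU learned on the route, a device mtu6 of 1500 and
 * no lightweight-tunnel state, fib6_mtu() simply returns 1500; a stored
 * fib6_pmtu of 1280 would override the device value instead.
 */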
static int rt6_insert_exception(struct rt6_info *nrt,
				const struct fib6_result *res)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *f6i = res->f6i;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	if (f6i->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(f6i->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* fib6_src.plen != 0 indicates f6i is in subtree
	 * and exception table is indexed by a hash of
	 * both fib6_dst and fib6_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only fib6_dst.
	 */
	if (f6i->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif
	/* rt6_mtu_change() might lower mtu on f6i.
	 * Only insert this exception route if its mtu
	 * is less than f6i's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(res)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&f6i->fib6_table->tb6_lock);
		fib6_update_sernum(net, f6i);
		spin_unlock_bh(&f6i->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}
void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() from recreating the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	const struct in6_addr *src_key = NULL;
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct rt6_info *ret = NULL;

#ifdef CONFIG_IPV6_SUBTREES
	/* fib6_src.plen != 0 indicates f6i is in subtree
	 * and exception table is indexed by a hash of
	 * both fib6_dst and fib6_src.
	 * However, the src addr used to create the hash
	 * might not be exactly the passed in saddr which
	 * is a /128 addr from the flow.
	 * So we need to use f6i->fib6_src to redo lookup
	 * if the passed in saddr does not find anything.
	 * (See the logic in ip6_rt_cache_alloc() on how
	 * rt->rt6i_src is updated.)
	 */
	if (res->f6i->fib6_src.plen)
		src_key = saddr;
find_ex:
#endif
	bucket = rcu_dereference(res->f6i->rt6i_exception_bucket);
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		ret = rt6_ex->rt6i;

#ifdef CONFIG_IPV6_SUBTREES
	/* Use fib6_src as src_key and redo lookup */
	if (!ret && src_key && src_key != &res->f6i->fib6_src.addr) {
		src_key = &res->f6i->fib6_src.addr;
		goto find_ex;
	}
#endif

	return ret;
}
/* Remove the passed in cached rt from the hash table that contains it */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	from = rcu_dereference(rt->from);
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}
/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	if (!from || !(rt->rt6i_flags & RTF_CACHE))
		goto unlock;

	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

unlock:
	rcu_read_unlock();
}
static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
	 *
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * handle this.
	 */

	if (dst_mtu(&rt->dst) >= mtu)
		return true;

	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
		return true;

	return false;
}
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}
#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non-gateway exceptions
	 * even if others still hold references to them, so that on the next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones - are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}
void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}
/* must be called with rcu lock held */
int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif,
		      struct flowi6 *fl6, struct fib6_result *res, int strict)
{
	struct fib6_node *fn, *saved_fn;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	rt6_select(net, fn, oif, res, strict);
	if (res->f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, res, table, fl6);

	return 0;
}
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_result res = {};
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	fib6_table_lookup(net, table, oif, fl6, &res, strict);
	if (res.f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		return rt;
	}

	fib6_select_path(net, &res, fl6, oif, false, skb, strict);

	/* Search through exception table */
	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !res.nh->fib_nh_gw_family)) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		return uncached_rt;
	} else {
		/* Get a percpu copy */
		struct rt6_info *pcpu_rt;

		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(&res);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, &res);

		local_bh_enable();
		rcu_read_unlock();

		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
static struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}

struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6,
					 const struct sk_buff *skb,
					 int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowlabel(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}
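/* For the ICMPv6 errors handled above, hashing on the *inner*
 * (offending) header makes the error message take the same multipath
 * route as the flow it refers to, so it reaches the node that actually
 * holds the flow state.
 */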
/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}
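/* Policy 0 (the default) hashes on L3 fields only; policy 1 folds in
 * the transport ports as well.  The policy is per-netns and can be
 * selected e.g. via the net.ipv6.fib_multipath_hash_policy sysctl.
 */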
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}
static struct rt6_info *ip6_pol_route_output(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
}

struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	if (ipv6_addr_type(&fl6->daddr) &
	    (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new = &rt->dst;
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}
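/* The returned blackhole dst keeps the original route's addressing and
 * metrics but silently discards anything sent through it; callers (e.g.
 * the xfrm lookup paths) use it as a safe stand-in while a usable dst
 * is not yet available.
 */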
/*
 *	Destination cache support functions
 */

static bool fib6_check(struct fib6_info *f6i, u32 cookie)
{
	u32 rt_cookie = 0;

	if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
		return false;

	if (fib6_check_expired(f6i))
		return false;

	return true;
}

static struct dst_entry *rt6_check(struct rt6_info *rt,
				   struct fib6_info *from,
				   u32 cookie)
{
	u32 rt_cookie = 0;

	if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
	    rt_cookie != cookie)
		return NULL;

	if (rt6_check_expired(rt))
		return NULL;

	return &rt->dst;
}

static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
					    struct fib6_info *from,
					    u32 cookie)
{
	if (!__rt6_check_expired(rt) &&
	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
	    fib6_check(from, cookie))
		return &rt->dst;
	else
		return NULL;
}
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct dst_entry *dst_ret;
	struct fib6_info *from;
	struct rt6_info *rt;

	rt = container_of(dst, struct rt6_info, dst);

	rcu_read_lock();

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	from = rcu_dereference(rt->from);

	if (from && (rt->rt6i_flags & RTF_PCPU ||
	    unlikely(!list_empty(&rt->rt6i_uncached))))
		dst_ret = rt6_dst_from_check(rt, from, cookie);
	else
		dst_ret = rt6_check(rt, from, cookie);

	rcu_read_unlock();

	return dst_ret;
}
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			rcu_read_lock();
			if (rt6_check_expired(rt)) {
				rt6_remove_exception_rt(rt);
				dst = NULL;
			}
			rcu_read_unlock();
		} else {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}
static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		rcu_read_lock();
		if (rt->rt6i_flags & RTF_CACHE) {
			rt6_remove_exception_rt(rt);
		} else {
			struct fib6_info *from;
			struct fib6_node *fn;

			from = rcu_dereference(rt->from);
			if (from) {
				fn = rcu_dereference(from->fib6_node);
				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
					fn->fn_sernum = -1;
			}
		}
		rcu_read_unlock();
	}
}
static void rt6_update_expires(struct rt6_info *rt0, int timeout)
{
	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
		struct fib6_info *from;

		rcu_read_lock();
		from = rcu_dereference(rt0->from);
		if (from)
			rt0->dst.expires = from->expires;
		rcu_read_unlock();
	}

	dst_set_expires(&rt0->dst, timeout);
	rt0->rt6i_flags |= RTF_EXPIRES;
}

static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
	struct net *net = dev_net(rt->dst.dev);

	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
	rt->rt6i_flags |= RTF_MODIFIED;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}

static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
{
	return !(rt->rt6i_flags & RTF_CACHE) &&
		(rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from));
}
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}
	dst_confirm_neigh(dst, daddr);
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
	} else if (daddr) {
		struct fib6_result res = {};
		struct rt6_info *nrt6;

		rcu_read_lock();
		res.f6i = rcu_dereference(rt6->from);
		if (!res.f6i) {
			rcu_read_unlock();
			return;
		}
		res.nh = &res.f6i->fib6_nh;
		res.fib6_flags = res.f6i->fib6_flags;
		res.fib6_type = res.f6i->fib6_type;

		nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);
			if (rt6_insert_exception(nrt6, &res))
				dst_release_immediate(&nrt6->dst);
		}
		rcu_read_unlock();
	}
}
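/* Cached (RTF_CACHE/per-cpu) routes take the reduced MTU directly; for
 * anything else a fresh RTF_CACHE exception clone carries the learned
 * PMTU, so the shared fib6_info itself is never dirtied.
 */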
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu)
{
	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
}

void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
		     int oif, u32 mark, kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_uid = uid,
	};

	dst = ip6_route_output(net, NULL, &fl6);
	if (!dst->error)
		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	int oif = sk->sk_bound_dev_if;
	struct dst_entry *dst;

	if (!oif && skb->dev)
		oif = l3mdev_master_ifindex(skb->dev);

	ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);

	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
			   const struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_SUBTREES
	struct ipv6_pinfo *np = inet6_sk(sk);
#endif

	ip6_dst_store(sk, dst,
		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
		      &sk->sk_v6_daddr : NULL,
#ifdef CONFIG_IPV6_SUBTREES
		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
		      &np->saddr :
#endif
		      NULL);
}
static bool ip6_redirect_nh_match(const struct fib6_result *res,
				  struct flowi6 *fl6,
				  const struct in6_addr *gw,
				  struct rt6_info **ret)
{
	const struct fib6_nh *nh = res->nh;

	if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family ||
	    fl6->flowi6_oif != nh->fib_nh_dev->ifindex)
		return false;

	/* rt_cache's gateway might be different from its 'parent'
	 * in the case of an ip redirect.
	 * So we keep searching in the exception table if the gateway
	 * is different.
	 */
	if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) {
		struct rt6_info *rt_cache;

		rt_cache = rt6_find_cached_rt(res, &fl6->daddr, &fl6->saddr);
		if (rt_cache &&
		    ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) {
			*ret = rt_cache;
			return true;
		}
		return false;
	}

	return true;
}
/* Handle redirects */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;
};
2503 static struct rt6_info *__ip6_route_redirect(struct net *net,
2504 struct fib6_table *table,
2506 const struct sk_buff *skb,
2509 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2510 struct rt6_info *ret = NULL;
2511 struct fib6_result res = {};
2512 struct fib6_info *rt;
2513 struct fib6_node *fn;
2515 /* l3mdev_update_flow overrides oif if the device is enslaved; in
2516 * this case we must match on the real ingress device, so reset it
2518 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
2519 fl6->flowi6_oif = skb->dev->ifindex;
2521 /* Get the "current" route for this destination and
2522 * check if the redirect has come from an appropriate router.
2524 * RFC 4861 specifies that redirects should only be
2525 * accepted if they come from the nexthop to the target.
2526 * Due to the way the routes are chosen, this notion
2527 * is a bit fuzzy and one might need to check all possible routers.
2532 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2534 for_each_fib6_node_rt_rcu(fn) {
2536 res.nh = &rt->fib6_nh;
2538 if (fib6_check_expired(rt))
2540 if (rt->fib6_flags & RTF_REJECT)
2542 if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway, &ret))
2547 rt = net->ipv6.fib6_null_entry;
2548 else if (rt->fib6_flags & RTF_REJECT) {
2549 ret = net->ipv6.ip6_null_entry;
2553 if (rt == net->ipv6.fib6_null_entry) {
2554 fn = fib6_backtrack(fn, &fl6->saddr);
2560 res.nh = &rt->fib6_nh;
2563 ip6_hold_safe(net, &ret);
2565 res.fib6_flags = res.f6i->fib6_flags;
2566 res.fib6_type = res.f6i->fib6_type;
2567 ret = ip6_create_rt_rcu(&res);
2572 trace_fib6_table_lookup(net, &res, table, fl6);
2576 static struct dst_entry *ip6_route_redirect(struct net *net,
2577 const struct flowi6 *fl6,
2578 const struct sk_buff *skb,
2579 const struct in6_addr *gateway)
2581 int flags = RT6_LOOKUP_F_HAS_SADDR;
2582 struct ip6rd_flowi rdfl;
2585 rdfl.gateway = *gateway;
2587 return fib6_rule_lookup(net, &rdfl.fl6, skb,
2588 flags, __ip6_route_redirect);
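/*
 * [editor's note -- illustrative sketch, not part of route.c]
 * ip6_route_redirect() needs to hand the redirecting gateway to
 * __ip6_route_redirect() through fib6_rule_lookup(), whose callback only
 * receives a struct flowi6 *. It does so by making the flowi6 the FIRST
 * member of struct ip6rd_flowi and casting back in the callee. The names
 * below are hypothetical; only the first-member embedding is the point:
 */
struct ex_wrapper {
	struct ex_key { int oif; } key;		/* stands in for flowi6 */
	int gateway;				/* the piggy-backed state */
};

static int ex_callee(struct ex_key *key)
{
	/* valid only because 'key' is the first member of ex_wrapper */
	struct ex_wrapper *w = (struct ex_wrapper *)key;

	return w->gateway;
}

static int ex_caller(void)
{
	struct ex_wrapper w = { .key = { .oif = 1 }, .gateway = 42 };

	return ex_callee(&w.key);		/* returns 42 */
}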
2591 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2594 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2595 struct dst_entry *dst;
2596 struct flowi6 fl6 = {
2597 .flowi6_iif = LOOPBACK_IFINDEX,
2599 .flowi6_mark = mark,
2600 .daddr = iph->daddr,
2601 .saddr = iph->saddr,
2602 .flowlabel = ip6_flowinfo(iph),
2606 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2607 rt6_do_redirect(dst, NULL, skb);
2610 EXPORT_SYMBOL_GPL(ip6_redirect);
2612 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
2614 const struct ipv6hdr *iph = ipv6_hdr(skb);
2615 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2616 struct dst_entry *dst;
2617 struct flowi6 fl6 = {
2618 .flowi6_iif = LOOPBACK_IFINDEX,
2621 .saddr = iph->daddr,
2622 .flowi6_uid = sock_net_uid(net, NULL),
2625 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2626 rt6_do_redirect(dst, NULL, skb);
2630 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2632 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2635 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2637 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2639 struct net_device *dev = dst->dev;
2640 unsigned int mtu = dst_mtu(dst);
2641 struct net *net = dev_net(dev);
2643 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2645 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2646 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2649 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2650 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2651 * IPV6_MAXPLEN is also valid and means: "any MSS,
2652 * rely only on pmtu discovery"
2654 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
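/*
 * [editor's note -- illustrative sketch, not part of route.c]
 * The advmss computation in ip6_default_advmss() above is:
 * MSS = MTU - 40 (IPv6 header) - 20 (base TCP header), floored at the
 * ip6_rt_min_advmss sysctl; anything above IPV6_MAXPLEN - 20 becomes
 * the IPV6_MAXPLEN sentinel ("any MSS, rely only on pmtu discovery").
 * A 1500-byte Ethernet MTU therefore advertises MSS 1440:
 */
#include <stdint.h>

static uint32_t ex_ipv6_advmss(uint32_t mtu, uint32_t min_advmss)
{
	uint32_t mss = mtu - 40 - 20;		/* ipv6hdr + tcphdr */

	if (mss < min_advmss)
		mss = min_advmss;
	if (mss > 65535u - 20)			/* IPV6_MAXPLEN - tcphdr */
		mss = 65535u;			/* "any MSS" sentinel */
	return mss;
}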
2659 static unsigned int ip6_mtu(const struct dst_entry *dst)
2661 struct inet6_dev *idev;
2664 mtu = dst_metric_raw(dst, RTAX_MTU);
2671 idev = __in6_dev_get(dst->dev);
2673 mtu = idev->cnf.mtu6;
2677 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2679 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2683 * 1. mtu on route is locked - use it
2684 * 2. mtu from nexthop exception
2685 * 3. mtu from egress device
2687 * based on ip6_dst_mtu_forward and exception logic of
2688 * rt6_find_cached_rt; called with rcu_read_lock
2690 u32 ip6_mtu_from_fib6(const struct fib6_result *res,
2691 const struct in6_addr *daddr,
2692 const struct in6_addr *saddr)
2694 const struct fib6_nh *nh = res->nh;
2695 struct fib6_info *f6i = res->f6i;
2696 struct inet6_dev *idev;
2697 struct rt6_info *rt;
2700 if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
2701 mtu = f6i->fib6_pmtu;
2706 rt = rt6_find_cached_rt(res, daddr, saddr);
2708 mtu = dst_metric_raw(&rt->dst, RTAX_MTU);
2710 struct net_device *dev = nh->fib_nh_dev;
2713 idev = __in6_dev_get(dev);
2714 if (idev && idev->cnf.mtu6 > mtu)
2715 mtu = idev->cnf.mtu6;
2718 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2720 return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
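/*
 * [editor's note -- illustrative sketch, not part of route.c]
 * ip6_mtu_from_fib6() above resolves the MTU in the priority order its
 * header comment lists: (1) a locked route metric wins outright, else
 * (2) a cached nexthop exception, else (3) the egress device MTU; the
 * result is capped at IP6_MAX_MTU and shrunk by lwtunnel headroom.
 * A condensed model (0 meaning "not present"):
 */
#include <stdint.h>

static uint32_t ex_fib6_mtu(uint32_t locked_mtu, uint32_t exception_mtu,
			    uint32_t device_mtu, uint32_t max_mtu,
			    uint32_t tunnel_headroom)
{
	uint32_t mtu;

	if (locked_mtu)
		mtu = locked_mtu;	/* step 1: fib6_metric_locked() */
	else if (exception_mtu)
		mtu = exception_mtu;	/* step 2: rt6_find_cached_rt() */
	else
		mtu = device_mtu;	/* step 3: idev->cnf.mtu6 */

	if (mtu > max_mtu)
		mtu = max_mtu;		/* min_t(..., IP6_MAX_MTU) */
	return mtu - tunnel_headroom;	/* lwtunnel_headroom() */
}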
2723 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2726 struct dst_entry *dst;
2727 struct rt6_info *rt;
2728 struct inet6_dev *idev = in6_dev_get(dev);
2729 struct net *net = dev_net(dev);
2731 if (unlikely(!idev))
2732 return ERR_PTR(-ENODEV);
2734 rt = ip6_dst_alloc(net, dev, 0);
2735 if (unlikely(!rt)) {
2737 dst = ERR_PTR(-ENOMEM);
2741 rt->dst.flags |= DST_HOST;
2742 rt->dst.input = ip6_input;
2743 rt->dst.output = ip6_output;
2744 rt->rt6i_gateway = fl6->daddr;
2745 rt->rt6i_dst.addr = fl6->daddr;
2746 rt->rt6i_dst.plen = 128;
2747 rt->rt6i_idev = idev;
2748 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2750 /* Add this dst into uncached_list so that rt6_disable_ip() can
2751 * do proper release of the net_device
2753 rt6_uncached_list_add(rt);
2754 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2756 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2762 static int ip6_dst_gc(struct dst_ops *ops)
2764 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2765 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2766 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2767 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2768 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2769 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2772 entries = dst_entries_get_fast(ops);
2773 if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2774 entries <= rt_max_size)
2777 net->ipv6.ip6_rt_gc_expire++;
2778 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2779 entries = dst_entries_get_slow(ops);
2780 if (entries < ops->gc_thresh)
2781 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2783 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2784 return entries > rt_max_size;
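/*
 * [editor's note -- illustrative sketch, not part of route.c]
 * The GC aggressiveness above is a feedback loop on ip6_rt_gc_expire:
 * every pass under pressure increments it, a pass that shrinks the
 * table below gc_thresh resets it to gc_timeout/2, and otherwise it
 * decays geometrically by (expire >> elasticity) -- with the default
 * elasticity of 9, about 1/512 of its value per pass:
 */
static unsigned long ex_gc_expire_step(unsigned long expire,
				       int shrunk_below_thresh,
				       int gc_timeout, int elasticity)
{
	expire++;				/* under pressure: try harder */
	if (shrunk_below_thresh)
		return gc_timeout >> 1;		/* relax again */
	return expire - (expire >> elasticity);	/* geometric decay */
}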
2787 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2788 struct fib6_config *cfg,
2789 const struct in6_addr *gw_addr,
2790 u32 tbid, int flags)
2792 struct flowi6 fl6 = {
2793 .flowi6_oif = cfg->fc_ifindex,
2795 .saddr = cfg->fc_prefsrc,
2797 struct fib6_table *table;
2798 struct rt6_info *rt;
2800 table = fib6_get_table(net, tbid);
2804 if (!ipv6_addr_any(&cfg->fc_prefsrc))
2805 flags |= RT6_LOOKUP_F_HAS_SADDR;
2807 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2808 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2810 /* if table lookup failed, fall back to full lookup */
2811 if (rt == net->ipv6.ip6_null_entry) {
2819 static int ip6_route_check_nh_onlink(struct net *net,
2820 struct fib6_config *cfg,
2821 const struct net_device *dev,
2822 struct netlink_ext_ack *extack)
2824 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2825 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2826 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2827 struct fib6_info *from;
2828 struct rt6_info *grt;
2832 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2835 from = rcu_dereference(grt->from);
2836 if (!grt->dst.error &&
2837 /* ignore match if it is the default route */
2838 from && !ipv6_addr_any(&from->fib6_dst.addr) &&
2839 (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2840 NL_SET_ERR_MSG(extack,
2841 "Nexthop has invalid gateway or device mismatch");
2852 static int ip6_route_check_nh(struct net *net,
2853 struct fib6_config *cfg,
2854 struct net_device **_dev,
2855 struct inet6_dev **idev)
2857 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2858 struct net_device *dev = _dev ? *_dev : NULL;
2859 struct rt6_info *grt = NULL;
2860 int err = -EHOSTUNREACH;
2862 if (cfg->fc_table) {
2863 int flags = RT6_LOOKUP_F_IFACE;
2865 grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2866 cfg->fc_table, flags);
2868 if (grt->rt6i_flags & RTF_GATEWAY ||
2869 (dev && dev != grt->dst.dev)) {
2877 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2883 if (dev != grt->dst.dev) {
2888 *_dev = dev = grt->dst.dev;
2889 *idev = grt->rt6i_idev;
2891 in6_dev_hold(grt->rt6i_idev);
2894 if (!(grt->rt6i_flags & RTF_GATEWAY))
2903 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2904 struct net_device **_dev, struct inet6_dev **idev,
2905 struct netlink_ext_ack *extack)
2907 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2908 int gwa_type = ipv6_addr_type(gw_addr);
2909 bool skip_dev = !(gwa_type & IPV6_ADDR_LINKLOCAL);
2910 const struct net_device *dev = *_dev;
2911 bool need_addr_check = !dev;
2914 /* if gw_addr is local we will fail to detect this in case
2915 * address is still TENTATIVE (DAD in progress). rt6_lookup()
2916 * will return already-added prefix route via interface that
2917 * prefix route was assigned to, which might be non-loopback.
2920 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2921 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2925 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2926 /* IPv6 strictly prohibits using non-link-local
2927 * addresses as nexthop addresses.
2928 * Otherwise, the router will not be able to send redirects.
2929 * It is very good, but in some (rare!) circumstances
2930 * (SIT, PtP, NBMA NOARP links) it is handy to allow
2931 * some exceptions. --ANK
2932 * We allow IPv4-mapped nexthops to support RFC4798-type addressing.
2935 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2936 NL_SET_ERR_MSG(extack, "Invalid gateway address");
2940 if (cfg->fc_flags & RTNH_F_ONLINK)
2941 err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2943 err = ip6_route_check_nh(net, cfg, _dev, idev);
2949 /* reload in case device was changed */
2954 NL_SET_ERR_MSG(extack, "Egress device not specified");
2956 } else if (dev->flags & IFF_LOOPBACK) {
2957 NL_SET_ERR_MSG(extack,
2958 "Egress device can not be loopback device for this route");
2962 /* if we did not check gw_addr above, do so now that the
2963 * egress device has been resolved.
2965 if (need_addr_check &&
2966 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2967 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
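/*
 * [editor's note -- illustrative sketch, not part of route.c]
 * The gateway sanity checks above key off the IPV6_ADDR_* bitmask that
 * ipv6_addr_type() returns (assumed from <net/ipv6.h>): a link-local
 * unicast gateway is the normal RFC 4861 case, and anything else must
 * at least be unicast or IPv4-mapped (the RFC 4798 exception):
 */
static bool ex_gateway_type_ok(int gwa_type)	/* ipv6_addr_type() value */
{
	if (gwa_type == (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST))
		return true;			/* redirects keep working */
	return (gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED)) != 0;
}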
2976 static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type)
2978 if ((flags & RTF_REJECT) ||
2979 (dev && (dev->flags & IFF_LOOPBACK) &&
2980 !(addr_type & IPV6_ADDR_LOOPBACK) &&
2981 !(flags & RTF_LOCAL)))
2987 int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
2988 struct fib6_config *cfg, gfp_t gfp_flags,
2989 struct netlink_ext_ack *extack)
2991 struct net_device *dev = NULL;
2992 struct inet6_dev *idev = NULL;
2996 fib6_nh->fib_nh_family = AF_INET6;
2999 if (cfg->fc_ifindex) {
3000 dev = dev_get_by_index(net, cfg->fc_ifindex);
3003 idev = in6_dev_get(dev);
3008 if (cfg->fc_flags & RTNH_F_ONLINK) {
3010 NL_SET_ERR_MSG(extack,
3011 "Nexthop device required for onlink");
3015 if (!(dev->flags & IFF_UP)) {
3016 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3021 fib6_nh->fib_nh_flags |= RTNH_F_ONLINK;
3024 fib6_nh->fib_nh_weight = 1;
3026 /* We cannot add true routes via loopback here,
3027 * they would result in kernel looping; promote them to reject routes
3029 addr_type = ipv6_addr_type(&cfg->fc_dst);
3030 if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) {
3031 /* hold loopback dev/idev if we haven't done so. */
3032 if (dev != net->loopback_dev) {
3037 dev = net->loopback_dev;
3039 idev = in6_dev_get(dev);
3048 if (cfg->fc_flags & RTF_GATEWAY) {
3049 err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
3053 fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
3054 fib6_nh->fib_nh_gw_family = AF_INET6;
3061 if (idev->cnf.disable_ipv6) {
3062 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3067 if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) {
3068 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3073 if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3074 !netif_carrier_ok(dev))
3075 fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
3077 err = fib_nh_common_init(&fib6_nh->nh_common, cfg->fc_encap,
3078 cfg->fc_encap_type, cfg, gfp_flags, extack);
3082 fib6_nh->fib_nh_dev = dev;
3083 fib6_nh->fib_nh_oif = dev->ifindex;
3090 lwtstate_put(fib6_nh->fib_nh_lws);
3091 fib6_nh->fib_nh_lws = NULL;
3099 void fib6_nh_release(struct fib6_nh *fib6_nh)
3101 fib_nh_common_release(&fib6_nh->nh_common);
3104 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
3106 struct netlink_ext_ack *extack)
3108 struct net *net = cfg->fc_nlinfo.nl_net;
3109 struct fib6_info *rt = NULL;
3110 struct fib6_table *table;
3114 /* RTF_PCPU is an internal flag; can not be set by userspace */
3115 if (cfg->fc_flags & RTF_PCPU) {
3116 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
3120 /* RTF_CACHE is an internal flag; can not be set by userspace */
3121 if (cfg->fc_flags & RTF_CACHE) {
3122 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
3126 if (cfg->fc_type > RTN_MAX) {
3127 NL_SET_ERR_MSG(extack, "Invalid route type");
3131 if (cfg->fc_dst_len > 128) {
3132 NL_SET_ERR_MSG(extack, "Invalid prefix length");
3135 if (cfg->fc_src_len > 128) {
3136 NL_SET_ERR_MSG(extack, "Invalid source address length");
3139 #ifndef CONFIG_IPV6_SUBTREES
3140 if (cfg->fc_src_len) {
3141 NL_SET_ERR_MSG(extack,
3142 "Specifying source address requires IPV6_SUBTREES to be enabled");
3148 if (cfg->fc_nlinfo.nlh &&
3149 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
3150 table = fib6_get_table(net, cfg->fc_table);
3152 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
3153 table = fib6_new_table(net, cfg->fc_table);
3156 table = fib6_new_table(net, cfg->fc_table);
3163 rt = fib6_info_alloc(gfp_flags);
3167 rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
3169 if (IS_ERR(rt->fib6_metrics)) {
3170 err = PTR_ERR(rt->fib6_metrics);
3171 /* Do not leave garbage there. */
3172 rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
3176 if (cfg->fc_flags & RTF_ADDRCONF)
3177 rt->dst_nocount = true;
3179 if (cfg->fc_flags & RTF_EXPIRES)
3180 fib6_set_expires(rt, jiffies +
3181 clock_t_to_jiffies(cfg->fc_expires));
3183 fib6_clean_expires(rt);
3185 if (cfg->fc_protocol == RTPROT_UNSPEC)
3186 cfg->fc_protocol = RTPROT_BOOT;
3187 rt->fib6_protocol = cfg->fc_protocol;
3189 rt->fib6_table = table;
3190 rt->fib6_metric = cfg->fc_metric;
3191 rt->fib6_type = cfg->fc_type;
3192 rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY;
3194 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
3195 rt->fib6_dst.plen = cfg->fc_dst_len;
3196 if (rt->fib6_dst.plen == 128)
3197 rt->dst_host = true;
3199 #ifdef CONFIG_IPV6_SUBTREES
3200 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
3201 rt->fib6_src.plen = cfg->fc_src_len;
3203 err = fib6_nh_init(net, &rt->fib6_nh, cfg, gfp_flags, extack);
3207 /* We cannot add true routes via loopback here,
3208 * they would result in kernel looping; promote them to reject routes
3210 addr_type = ipv6_addr_type(&cfg->fc_dst);
3211 if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh.fib_nh_dev, addr_type))
3212 rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;
3214 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3215 struct net_device *dev = fib6_info_nh_dev(rt);
3217 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3218 NL_SET_ERR_MSG(extack, "Invalid source address");
3222 rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3223 rt->fib6_prefsrc.plen = 128;
3225 rt->fib6_prefsrc.plen = 0;
3229 fib6_info_release(rt);
3230 return ERR_PTR(err);
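/*
 * [editor's note -- illustrative sketch, not part of route.c]
 * ip6_route_info_create() consumes a struct fib6_config. A hypothetical
 * in-kernel filler for a static /64 via a gateway, using only fields
 * this file itself populates elsewhere (ifindex and prefix length are
 * placeholders, not suggested defaults):
 */
static void ex_fill_static_route_cfg(struct net *net, int ifindex,
				     const struct in6_addr *dst,
				     const struct in6_addr *gw,
				     struct fib6_config *cfg)
{
	*cfg = (struct fib6_config){
		.fc_table	= RT6_TABLE_MAIN,
		.fc_metric	= IP6_RT_PRIO_USER,
		.fc_ifindex	= ifindex,
		.fc_dst_len	= 64,
		.fc_flags	= RTF_UP | RTF_GATEWAY,
		.fc_protocol	= RTPROT_STATIC,
		.fc_type	= RTN_UNICAST,
		.fc_nlinfo.nl_net = net,
	};
	cfg->fc_dst = *dst;
	cfg->fc_gateway = *gw;
	/* ip6_route_add(cfg, GFP_KERNEL, NULL) would then insert it,
	 * exactly as the RA and ioctl paths in this file do. */
}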
3233 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3234 struct netlink_ext_ack *extack)
3236 struct fib6_info *rt;
3239 rt = ip6_route_info_create(cfg, gfp_flags, extack);
3243 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3244 fib6_info_release(rt);
3249 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3251 struct net *net = info->nl_net;
3252 struct fib6_table *table;
3255 if (rt == net->ipv6.fib6_null_entry) {
3260 table = rt->fib6_table;
3261 spin_lock_bh(&table->tb6_lock);
3262 err = fib6_del(rt, info);
3263 spin_unlock_bh(&table->tb6_lock);
3266 fib6_info_release(rt);
3270 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3272 struct nl_info info = { .nl_net = net };
3274 return __ip6_del_rt(rt, &info);
3277 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3279 struct nl_info *info = &cfg->fc_nlinfo;
3280 struct net *net = info->nl_net;
3281 struct sk_buff *skb = NULL;
3282 struct fib6_table *table;
3285 if (rt == net->ipv6.fib6_null_entry)
3287 table = rt->fib6_table;
3288 spin_lock_bh(&table->tb6_lock);
3290 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3291 struct fib6_info *sibling, *next_sibling;
3293 /* prefer to send a single notification with all hops */
3294 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3296 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3298 if (rt6_fill_node(net, skb, rt, NULL,
3299 NULL, NULL, 0, RTM_DELROUTE,
3300 info->portid, seq, 0) < 0) {
3304 info->skip_notify = 1;
3307 list_for_each_entry_safe(sibling, next_sibling,
3310 err = fib6_del(sibling, info);
3316 err = fib6_del(rt, info);
3318 spin_unlock_bh(&table->tb6_lock);
3320 fib6_info_release(rt);
3323 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3324 info->nlh, gfp_any());
3329 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3333 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3336 if (cfg->fc_flags & RTF_GATEWAY &&
3337 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3340 rc = rt6_remove_exception_rt(rt);
3345 static int ip6_route_del(struct fib6_config *cfg,
3346 struct netlink_ext_ack *extack)
3348 struct rt6_info *rt_cache;
3349 struct fib6_table *table;
3350 struct fib6_info *rt;
3351 struct fib6_node *fn;
3354 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3356 NL_SET_ERR_MSG(extack, "FIB table does not exist");
3362 fn = fib6_locate(&table->tb6_root,
3363 &cfg->fc_dst, cfg->fc_dst_len,
3364 &cfg->fc_src, cfg->fc_src_len,
3365 !(cfg->fc_flags & RTF_CACHE));
3368 for_each_fib6_node_rt_rcu(fn) {
3371 if (cfg->fc_flags & RTF_CACHE) {
3372 struct fib6_result res = {
3377 rt_cache = rt6_find_cached_rt(&res,
3381 rc = ip6_del_cached_rt(rt_cache, cfg);
3391 if (cfg->fc_ifindex &&
3393 nh->fib_nh_dev->ifindex != cfg->fc_ifindex))
3395 if (cfg->fc_flags & RTF_GATEWAY &&
3396 !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6))
3398 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3400 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3402 if (!fib6_info_hold_safe(rt))
3406 /* if a gateway was specified, only delete the one hop */
3407 if (cfg->fc_flags & RTF_GATEWAY)
3408 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3410 return __ip6_del_rt_siblings(rt, cfg);
3418 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3420 struct netevent_redirect netevent;
3421 struct rt6_info *rt, *nrt = NULL;
3422 struct fib6_result res = {};
3423 struct ndisc_options ndopts;
3424 struct inet6_dev *in6_dev;
3425 struct neighbour *neigh;
3427 int optlen, on_link;
3430 optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3431 optlen -= sizeof(*msg);
3434 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3438 msg = (struct rd_msg *)icmp6_hdr(skb);
3440 if (ipv6_addr_is_multicast(&msg->dest)) {
3441 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3446 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3448 } else if (ipv6_addr_type(&msg->target) !=
3449 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3450 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3454 in6_dev = __in6_dev_get(skb->dev);
3457 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3461 * The IP source address of the Redirect MUST be the same as the current
3462 * first-hop router for the specified ICMP Destination Address.
3465 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3466 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3471 if (ndopts.nd_opts_tgt_lladdr) {
3472 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3475 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3480 rt = (struct rt6_info *) dst;
3481 if (rt->rt6i_flags & RTF_REJECT) {
3482 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3486 /* Redirect received -> path was valid.
3487 * Look, redirects are sent only in response to data packets,
3488 * so this nexthop is apparently reachable. --ANK
3490 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3492 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3497 * We have finally decided to accept it.
3500 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3501 NEIGH_UPDATE_F_WEAK_OVERRIDE|
3502 NEIGH_UPDATE_F_OVERRIDE|
3503 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3504 NEIGH_UPDATE_F_ISROUTER)),
3505 NDISC_REDIRECT, &ndopts);
3508 res.f6i = rcu_dereference(rt->from);
3512 res.nh = &res.f6i->fib6_nh;
3513 res.fib6_flags = res.f6i->fib6_flags;
3514 res.fib6_type = res.f6i->fib6_type;
3515 nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL);
3519 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3521 nrt->rt6i_flags &= ~RTF_GATEWAY;
3523 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3525 /* rt6_insert_exception() will take care of duplicated exceptions */
3526 if (rt6_insert_exception(nrt, &res)) {
3527 dst_release_immediate(&nrt->dst);
3531 netevent.old = &rt->dst;
3532 netevent.new = &nrt->dst;
3533 netevent.daddr = &msg->dest;
3534 netevent.neigh = neigh;
3535 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3539 neigh_release(neigh);
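/*
 * [editor's note -- illustrative sketch, not part of route.c]
 * The length check at the top of rt6_do_redirect(): ND options begin
 * immediately after the fixed struct rd_msg, so the option bytes are
 * whatever sits between the transport header and the skb tail, minus
 * sizeof(struct rd_msg). Negative means "packet too short":
 */
static int ex_redirect_optlen(int transport_to_tail, int rd_msg_size)
{
	int optlen = transport_to_tail - rd_msg_size;

	return optlen < 0 ? -1 : optlen;
}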
3542 #ifdef CONFIG_IPV6_ROUTE_INFO
3543 static struct fib6_info *rt6_get_route_info(struct net *net,
3544 const struct in6_addr *prefix, int prefixlen,
3545 const struct in6_addr *gwaddr,
3546 struct net_device *dev)
3548 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3549 int ifindex = dev->ifindex;
3550 struct fib6_node *fn;
3551 struct fib6_info *rt = NULL;
3552 struct fib6_table *table;
3554 table = fib6_get_table(net, tb_id);
3559 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3563 for_each_fib6_node_rt_rcu(fn) {
3564 if (rt->fib6_nh.fib_nh_dev->ifindex != ifindex)
3566 if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
3567 !rt->fib6_nh.fib_nh_gw_family)
3569 if (!ipv6_addr_equal(&rt->fib6_nh.fib_nh_gw6, gwaddr))
3571 if (!fib6_info_hold_safe(rt))
3580 static struct fib6_info *rt6_add_route_info(struct net *net,
3581 const struct in6_addr *prefix, int prefixlen,
3582 const struct in6_addr *gwaddr,
3583 struct net_device *dev,
3586 struct fib6_config cfg = {
3587 .fc_metric = IP6_RT_PRIO_USER,
3588 .fc_ifindex = dev->ifindex,
3589 .fc_dst_len = prefixlen,
3590 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3591 RTF_UP | RTF_PREF(pref),
3592 .fc_protocol = RTPROT_RA,
3593 .fc_type = RTN_UNICAST,
3594 .fc_nlinfo.portid = 0,
3595 .fc_nlinfo.nlh = NULL,
3596 .fc_nlinfo.nl_net = net,
3599 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3600 cfg.fc_dst = *prefix;
3601 cfg.fc_gateway = *gwaddr;
3603 /* We should treat it as a default route if prefix length is 0. */
3605 cfg.fc_flags |= RTF_DEFAULT;
3607 ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3609 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3613 struct fib6_info *rt6_get_dflt_router(struct net *net,
3614 const struct in6_addr *addr,
3615 struct net_device *dev)
3617 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3618 struct fib6_info *rt;
3619 struct fib6_table *table;
3621 table = fib6_get_table(net, tb_id);
3626 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3627 struct fib6_nh *nh = &rt->fib6_nh;
3629 if (dev == nh->fib_nh_dev &&
3630 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3631 ipv6_addr_equal(&nh->fib_nh_gw6, addr))
3634 if (rt && !fib6_info_hold_safe(rt))
3640 struct fib6_info *rt6_add_dflt_router(struct net *net,
3641 const struct in6_addr *gwaddr,
3642 struct net_device *dev,
3645 struct fib6_config cfg = {
3646 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3647 .fc_metric = IP6_RT_PRIO_USER,
3648 .fc_ifindex = dev->ifindex,
3649 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3650 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3651 .fc_protocol = RTPROT_RA,
3652 .fc_type = RTN_UNICAST,
3653 .fc_nlinfo.portid = 0,
3654 .fc_nlinfo.nlh = NULL,
3655 .fc_nlinfo.nl_net = net,
3658 cfg.fc_gateway = *gwaddr;
3660 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3661 struct fib6_table *table;
3663 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3665 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3668 return rt6_get_dflt_router(net, gwaddr, dev);
3671 static void __rt6_purge_dflt_routers(struct net *net,
3672 struct fib6_table *table)
3674 struct fib6_info *rt;
3678 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3679 struct net_device *dev = fib6_info_nh_dev(rt);
3680 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3682 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3683 (!idev || idev->cnf.accept_ra != 2) &&
3684 fib6_info_hold_safe(rt)) {
3686 ip6_del_rt(net, rt);
3692 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3695 void rt6_purge_dflt_routers(struct net *net)
3697 struct fib6_table *table;
3698 struct hlist_head *head;
3703 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3704 head = &net->ipv6.fib_table_hash[h];
3705 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3706 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3707 __rt6_purge_dflt_routers(net, table);
3714 static void rtmsg_to_fib6_config(struct net *net,
3715 struct in6_rtmsg *rtmsg,
3716 struct fib6_config *cfg)
3718 *cfg = (struct fib6_config){
3719 .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3721 .fc_ifindex = rtmsg->rtmsg_ifindex,
3722 .fc_metric = rtmsg->rtmsg_metric ? : IP6_RT_PRIO_USER,
3723 .fc_expires = rtmsg->rtmsg_info,
3724 .fc_dst_len = rtmsg->rtmsg_dst_len,
3725 .fc_src_len = rtmsg->rtmsg_src_len,
3726 .fc_flags = rtmsg->rtmsg_flags,
3727 .fc_type = rtmsg->rtmsg_type,
3729 .fc_nlinfo.nl_net = net,
3731 .fc_dst = rtmsg->rtmsg_dst,
3732 .fc_src = rtmsg->rtmsg_src,
3733 .fc_gateway = rtmsg->rtmsg_gateway,
3737 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3739 struct fib6_config cfg;
3740 struct in6_rtmsg rtmsg;
3744 case SIOCADDRT: /* Add a route */
3745 case SIOCDELRT: /* Delete a route */
3746 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3748 err = copy_from_user(&rtmsg, arg,
3749 sizeof(struct in6_rtmsg));
3753 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3758 err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3761 err = ip6_route_del(&cfg, NULL);
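/*
 * [editor's note -- illustrative user-space sketch, not part of route.c]
 * ipv6_route_ioctl() above is the kernel end of the legacy
 * SIOCADDRT/SIOCDELRT interface: user space fills a struct in6_rtmsg
 * (the fields rtmsg_to_fib6_config() copies) and issues the ioctl on an
 * AF_INET6 socket. A hedged example (2001:db8::/64 is a placeholder,
 * and the caller needs CAP_NET_ADMIN):
 */
#include <string.h>
#include <sys/ioctl.h>		/* SIOCADDRT */
#include <sys/socket.h>
#include <arpa/inet.h>
#include <net/route.h>		/* struct in6_rtmsg, RTF_UP */

static int ex_add_route(int ifindex)
{
	struct in6_rtmsg rt;
	int fd = socket(AF_INET6, SOCK_DGRAM, 0);

	if (fd < 0)
		return -1;
	memset(&rt, 0, sizeof(rt));
	inet_pton(AF_INET6, "2001:db8::", &rt.rtmsg_dst);
	rt.rtmsg_dst_len = 64;
	rt.rtmsg_metric	 = 1;	/* 0 would default to IP6_RT_PRIO_USER */
	rt.rtmsg_flags	 = RTF_UP;
	rt.rtmsg_ifindex = ifindex;
	return ioctl(fd, SIOCADDRT, &rt);
}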
3775 * Drop the packet on the floor
3778 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3780 struct dst_entry *dst = skb_dst(skb);
3781 struct net *net = dev_net(dst->dev);
3782 struct inet6_dev *idev;
3785 if (netif_is_l3_master(skb->dev) &&
3786 dst->dev == net->loopback_dev)
3787 idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
3789 idev = ip6_dst_idev(dst);
3791 switch (ipstats_mib_noroutes) {
3792 case IPSTATS_MIB_INNOROUTES:
3793 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3794 if (type == IPV6_ADDR_ANY) {
3795 IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
3799 case IPSTATS_MIB_OUTNOROUTES:
3800 IP6_INC_STATS(net, idev, ipstats_mib_noroutes);
3804 /* Start over by dropping the dst for l3mdev case */
3805 if (netif_is_l3_master(skb->dev))
3808 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3813 static int ip6_pkt_discard(struct sk_buff *skb)
3815 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3818 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3820 skb->dev = skb_dst(skb)->dev;
3821 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3824 static int ip6_pkt_prohibit(struct sk_buff *skb)
3826 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3829 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3831 skb->dev = skb_dst(skb)->dev;
3832 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3836 * Allocate a dst for local (unicast / anycast) address.
3839 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3840 struct inet6_dev *idev,
3841 const struct in6_addr *addr,
3842 bool anycast, gfp_t gfp_flags)
3844 struct fib6_config cfg = {
3845 .fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL,
3846 .fc_ifindex = idev->dev->ifindex,
3847 .fc_flags = RTF_UP | RTF_ADDRCONF | RTF_NONEXTHOP,
3850 .fc_protocol = RTPROT_KERNEL,
3851 .fc_nlinfo.nl_net = net,
3852 .fc_ignore_dev_down = true,
3856 cfg.fc_type = RTN_ANYCAST;
3857 cfg.fc_flags |= RTF_ANYCAST;
3859 cfg.fc_type = RTN_LOCAL;
3860 cfg.fc_flags |= RTF_LOCAL;
3863 return ip6_route_info_create(&cfg, gfp_flags, NULL);
3866 /* remove deleted ip from prefsrc entries */
3867 struct arg_dev_net_ip {
3868 struct net_device *dev;
3870 struct in6_addr *addr;
3873 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3875 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3876 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3877 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3879 if (((void *)rt->fib6_nh.fib_nh_dev == dev || !dev) &&
3880 rt != net->ipv6.fib6_null_entry &&
3881 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3882 spin_lock_bh(&rt6_exception_lock);
3883 /* remove prefsrc entry */
3884 rt->fib6_prefsrc.plen = 0;
3885 spin_unlock_bh(&rt6_exception_lock);
3890 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3892 struct net *net = dev_net(ifp->idev->dev);
3893 struct arg_dev_net_ip adni = {
3894 .dev = ifp->idev->dev,
3898 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3901 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT)
3903 /* Remove routers and update dst entries when a gateway turns into a host. */
3904 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3906 struct in6_addr *gateway = (struct in6_addr *)arg;
3908 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3909 rt->fib6_nh.fib_nh_gw_family &&
3910 ipv6_addr_equal(gateway, &rt->fib6_nh.fib_nh_gw6)) {
3914 /* Further clean up cached routes in the exception table.
3915 * This is needed because a cached route may have a different
3916 * gateway from its 'parent' in the case of an IP redirect.
3918 rt6_exceptions_clean_tohost(rt, gateway);
3923 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3925 fib6_clean_all(net, fib6_clean_tohost, gateway);
3928 struct arg_netdev_event {
3929 const struct net_device *dev;
3931 unsigned char nh_flags;
3932 unsigned long event;
3936 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3938 struct fib6_info *iter;
3939 struct fib6_node *fn;
3941 fn = rcu_dereference_protected(rt->fib6_node,
3942 lockdep_is_held(&rt->fib6_table->tb6_lock));
3943 iter = rcu_dereference_protected(fn->leaf,
3944 lockdep_is_held(&rt->fib6_table->tb6_lock));
3946 if (iter->fib6_metric == rt->fib6_metric &&
3947 rt6_qualify_for_ecmp(iter))
3949 iter = rcu_dereference_protected(iter->fib6_next,
3950 lockdep_is_held(&rt->fib6_table->tb6_lock));
3956 static bool rt6_is_dead(const struct fib6_info *rt)
3958 if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD ||
3959 (rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN &&
3960 ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev)))
3966 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3968 struct fib6_info *iter;
3971 if (!rt6_is_dead(rt))
3972 total += rt->fib6_nh.fib_nh_weight;
3974 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3975 if (!rt6_is_dead(iter))
3976 total += iter->fib6_nh.fib_nh_weight;
3982 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3984 int upper_bound = -1;
3986 if (!rt6_is_dead(rt)) {
3987 *weight += rt->fib6_nh.fib_nh_weight;
3988 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3991 atomic_set(&rt->fib6_nh.fib_nh_upper_bound, upper_bound);
3994 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3996 struct fib6_info *iter;
3999 rt6_upper_bound_set(rt, &weight, total);
4001 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4002 rt6_upper_bound_set(iter, &weight, total);
4005 void rt6_multipath_rebalance(struct fib6_info *rt)
4007 struct fib6_info *first;
4010 /* If the entire multipath route was marked for flushing,
4011 * there is no need to rebalance upon the removal of every
4014 if (!rt->fib6_nsiblings || rt->should_flush)
4017 /* During lookup routes are evaluated in order, so we need to
4018 * make sure upper bounds are assigned from the first sibling
4021 first = rt6_multipath_first_sibling(rt);
4022 if (WARN_ON_ONCE(!first))
4025 total = rt6_multipath_total_weight(first);
4026 rt6_multipath_upper_bound_set(first, total);
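/*
 * [editor's note -- illustrative sketch, not part of route.c]
 * The weighted-ECMP ("hash-threshold") arithmetic above: walking the
 * siblings in order, each live nexthop is assigned an upper bound of
 * DIV_ROUND_CLOSEST_ULL((u64)cum_weight << 31, total) - 1 (the "- 1"
 * falls on a line elided here but is present in mainline), and a flow's
 * 31-bit hash picks the first nexthop whose bound it does not exceed.
 * Weights {1, 2, 3} therefore split the hash space 1:2:3:
 */
#include <stdint.h>

static int32_t ex_upper_bound(uint64_t cum_weight, uint64_t total)
{
	/* DIV_ROUND_CLOSEST_ULL(x, d) == (x + d/2) / d */
	return (int32_t)(((cum_weight << 31) + total / 2) / total) - 1;
}

static int ex_select_nexthop(const int32_t *bounds, int n, int32_t hash)
{
	int i;

	for (i = 0; i < n; i++)
		if (hash <= bounds[i])
			return i;
	return n - 1;	/* last bound is 2^31 - 1, so this is unreachable */
}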
4029 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
4031 const struct arg_netdev_event *arg = p_arg;
4032 struct net *net = dev_net(arg->dev);
4034 if (rt != net->ipv6.fib6_null_entry &&
4035 rt->fib6_nh.fib_nh_dev == arg->dev) {
4036 rt->fib6_nh.fib_nh_flags &= ~arg->nh_flags;
4037 fib6_update_sernum_upto_root(net, rt);
4038 rt6_multipath_rebalance(rt);
4044 void rt6_sync_up(struct net_device *dev, unsigned char nh_flags)
4046 struct arg_netdev_event arg = {
4049 .nh_flags = nh_flags,
4053 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
4054 arg.nh_flags |= RTNH_F_LINKDOWN;
4056 fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
4059 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
4060 const struct net_device *dev)
4062 struct fib6_info *iter;
4064 if (rt->fib6_nh.fib_nh_dev == dev)
4066 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4067 if (iter->fib6_nh.fib_nh_dev == dev)
4073 static void rt6_multipath_flush(struct fib6_info *rt)
4075 struct fib6_info *iter;
4077 rt->should_flush = 1;
4078 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4079 iter->should_flush = 1;
4082 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
4083 const struct net_device *down_dev)
4085 struct fib6_info *iter;
4086 unsigned int dead = 0;
4088 if (rt->fib6_nh.fib_nh_dev == down_dev ||
4089 rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
4091 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4092 if (iter->fib6_nh.fib_nh_dev == down_dev ||
4093 iter->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
4099 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
4100 const struct net_device *dev,
4101 unsigned char nh_flags)
4103 struct fib6_info *iter;
4105 if (rt->fib6_nh.fib_nh_dev == dev)
4106 rt->fib6_nh.fib_nh_flags |= nh_flags;
4107 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4108 if (iter->fib6_nh.fib_nh_dev == dev)
4109 iter->fib6_nh.fib_nh_flags |= nh_flags;
4112 /* called with write lock held for table with rt */
4113 static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
4115 const struct arg_netdev_event *arg = p_arg;
4116 const struct net_device *dev = arg->dev;
4117 struct net *net = dev_net(dev);
4119 if (rt == net->ipv6.fib6_null_entry)
4122 switch (arg->event) {
4123 case NETDEV_UNREGISTER:
4124 return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
4126 if (rt->should_flush)
4128 if (!rt->fib6_nsiblings)
4129 return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
4130 if (rt6_multipath_uses_dev(rt, dev)) {
4133 count = rt6_multipath_dead_count(rt, dev);
4134 if (rt->fib6_nsiblings + 1 == count) {
4135 rt6_multipath_flush(rt);
4138 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4140 fib6_update_sernum(net, rt);
4141 rt6_multipath_rebalance(rt);
4145 if (rt->fib6_nh.fib_nh_dev != dev ||
4146 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4148 rt->fib6_nh.fib_nh_flags |= RTNH_F_LINKDOWN;
4149 rt6_multipath_rebalance(rt);
4156 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4158 struct arg_netdev_event arg = {
4164 struct net *net = dev_net(dev);
4166 if (net->ipv6.sysctl.skip_notify_on_dev_down)
4167 fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
4169 fib6_clean_all(net, fib6_ifdown, &arg);
4172 void rt6_disable_ip(struct net_device *dev, unsigned long event)
4174 rt6_sync_down_dev(dev, event);
4175 rt6_uncached_list_flush_dev(dev_net(dev), dev);
4176 neigh_ifdown(&nd_tbl, dev);
4179 struct rt6_mtu_change_arg {
4180 struct net_device *dev;
4184 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4186 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4187 struct inet6_dev *idev;
4189 /* In IPv6, PMTU discovery is not optional,
4190 so the RTAX_MTU lock cannot disable it.
4191 We still use this lock to block changes
4192 caused by addrconf/ndisc.
4195 idev = __in6_dev_get(arg->dev);
4199 /* For an administrative MTU increase there is no way to discover
4200 the IPv6 PMTU increase, so the PMTU must be updated here.
4201 Since RFC 1981 doesn't cover administrative MTU increases,
4202 updating the PMTU here is a MUST (e.g. for jumbo frames).
4204 if (rt->fib6_nh.fib_nh_dev == arg->dev &&
4205 !fib6_metric_locked(rt, RTAX_MTU)) {
4206 u32 mtu = rt->fib6_pmtu;
4208 if (mtu >= arg->mtu ||
4209 (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4210 fib6_metric_set(rt, RTAX_MTU, arg->mtu);
4212 spin_lock_bh(&rt6_exception_lock);
4213 rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4214 spin_unlock_bh(&rt6_exception_lock);
4219 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4221 struct rt6_mtu_change_arg arg = {
4226 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4229 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4230 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
4231 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) },
4232 [RTA_OIF] = { .type = NLA_U32 },
4233 [RTA_IIF] = { .type = NLA_U32 },
4234 [RTA_PRIORITY] = { .type = NLA_U32 },
4235 [RTA_METRICS] = { .type = NLA_NESTED },
4236 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
4237 [RTA_PREF] = { .type = NLA_U8 },
4238 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
4239 [RTA_ENCAP] = { .type = NLA_NESTED },
4240 [RTA_EXPIRES] = { .type = NLA_U32 },
4241 [RTA_UID] = { .type = NLA_U32 },
4242 [RTA_MARK] = { .type = NLA_U32 },
4243 [RTA_TABLE] = { .type = NLA_U32 },
4244 [RTA_IP_PROTO] = { .type = NLA_U8 },
4245 [RTA_SPORT] = { .type = NLA_U16 },
4246 [RTA_DPORT] = { .type = NLA_U16 },
4249 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4250 struct fib6_config *cfg,
4251 struct netlink_ext_ack *extack)
4254 struct nlattr *tb[RTA_MAX+1];
4258 err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
4259 rtm_ipv6_policy, extack);
4264 rtm = nlmsg_data(nlh);
4266 *cfg = (struct fib6_config){
4267 .fc_table = rtm->rtm_table,
4268 .fc_dst_len = rtm->rtm_dst_len,
4269 .fc_src_len = rtm->rtm_src_len,
4271 .fc_protocol = rtm->rtm_protocol,
4272 .fc_type = rtm->rtm_type,
4274 .fc_nlinfo.portid = NETLINK_CB(skb).portid,
4275 .fc_nlinfo.nlh = nlh,
4276 .fc_nlinfo.nl_net = sock_net(skb->sk),
4279 if (rtm->rtm_type == RTN_UNREACHABLE ||
4280 rtm->rtm_type == RTN_BLACKHOLE ||
4281 rtm->rtm_type == RTN_PROHIBIT ||
4282 rtm->rtm_type == RTN_THROW)
4283 cfg->fc_flags |= RTF_REJECT;
4285 if (rtm->rtm_type == RTN_LOCAL)
4286 cfg->fc_flags |= RTF_LOCAL;
4288 if (rtm->rtm_flags & RTM_F_CLONED)
4289 cfg->fc_flags |= RTF_CACHE;
4291 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4293 if (tb[RTA_GATEWAY]) {
4294 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4295 cfg->fc_flags |= RTF_GATEWAY;
4298 NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
4303 int plen = (rtm->rtm_dst_len + 7) >> 3;
4305 if (nla_len(tb[RTA_DST]) < plen)
4308 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4312 int plen = (rtm->rtm_src_len + 7) >> 3;
4314 if (nla_len(tb[RTA_SRC]) < plen)
4317 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4320 if (tb[RTA_PREFSRC])
4321 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4324 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4326 if (tb[RTA_PRIORITY])
4327 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4329 if (tb[RTA_METRICS]) {
4330 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4331 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4335 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4337 if (tb[RTA_MULTIPATH]) {
4338 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4339 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4341 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4342 cfg->fc_mp_len, extack);
4348 pref = nla_get_u8(tb[RTA_PREF]);
4349 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4350 pref != ICMPV6_ROUTER_PREF_HIGH)
4351 pref = ICMPV6_ROUTER_PREF_MEDIUM;
4352 cfg->fc_flags |= RTF_PREF(pref);
4356 cfg->fc_encap = tb[RTA_ENCAP];
4358 if (tb[RTA_ENCAP_TYPE]) {
4359 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4361 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4366 if (tb[RTA_EXPIRES]) {
4367 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4369 if (addrconf_finite_timeout(timeout)) {
4370 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4371 cfg->fc_flags |= RTF_EXPIRES;
4381 struct fib6_info *fib6_info;
4382 struct fib6_config r_cfg;
4383 struct list_head next;
4386 static int ip6_route_info_append(struct net *net,
4387 struct list_head *rt6_nh_list,
4388 struct fib6_info *rt,
4389 struct fib6_config *r_cfg)
4394 list_for_each_entry(nh, rt6_nh_list, next) {
4395 /* check if fib6_info already exists */
4396 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4400 nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4404 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4405 list_add_tail(&nh->next, rt6_nh_list);
4410 static void ip6_route_mpath_notify(struct fib6_info *rt,
4411 struct fib6_info *rt_last,
4412 struct nl_info *info,
4415 /* if this is an APPEND route, then rt points to the first route
4416 * inserted and rt_last points to last route inserted. Userspace
4417 * wants a consistent dump of the route which starts at the first
4418 * nexthop. Since sibling routes are always added at the end of
4419 * the list, find the first sibling of the last route appended
4421 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4422 rt = list_first_entry(&rt_last->fib6_siblings,
4428 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4431 static int ip6_route_multipath_add(struct fib6_config *cfg,
4432 struct netlink_ext_ack *extack)
4434 struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4435 struct nl_info *info = &cfg->fc_nlinfo;
4436 struct fib6_config r_cfg;
4437 struct rtnexthop *rtnh;
4438 struct fib6_info *rt;
4439 struct rt6_nh *err_nh;
4440 struct rt6_nh *nh, *nh_safe;
4446 int replace = (cfg->fc_nlinfo.nlh &&
4447 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4448 LIST_HEAD(rt6_nh_list);
4450 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4451 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4452 nlflags |= NLM_F_APPEND;
4454 remaining = cfg->fc_mp_len;
4455 rtnh = (struct rtnexthop *)cfg->fc_mp;
4457 /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4458 * fib6_info structs per nexthop
4460 while (rtnh_ok(rtnh, remaining)) {
4461 memcpy(&r_cfg, cfg, sizeof(*cfg));
4462 if (rtnh->rtnh_ifindex)
4463 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4465 attrlen = rtnh_attrlen(rtnh);
4467 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4469 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4471 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4472 r_cfg.fc_flags |= RTF_GATEWAY;
4474 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4475 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4477 r_cfg.fc_encap_type = nla_get_u16(nla);
4480 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4481 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4487 if (!rt6_qualify_for_ecmp(rt)) {
4489 NL_SET_ERR_MSG(extack,
4490 "Device only routes can not be added for IPv6 using the multipath API.");
4491 fib6_info_release(rt);
4495 rt->fib6_nh.fib_nh_weight = rtnh->rtnh_hops + 1;
4497 err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4500 fib6_info_release(rt);
4504 rtnh = rtnh_next(rtnh, &remaining);
4507 /* for add and replace send one notification with all nexthops.
4508 * Skip the notification in fib6_add_rt2node and send one with
4509 * the full route when done
4511 info->skip_notify = 1;
4514 list_for_each_entry(nh, &rt6_nh_list, next) {
4515 err = __ip6_ins_rt(nh->fib6_info, info, extack);
4516 fib6_info_release(nh->fib6_info);
4519 /* save reference to last route successfully inserted */
4520 rt_last = nh->fib6_info;
4522 /* save reference to first route for notification */
4524 rt_notif = nh->fib6_info;
4527 /* nh->fib6_info is used or freed at this point, reset to NULL*/
4528 nh->fib6_info = NULL;
4531 NL_SET_ERR_MSG_MOD(extack,
4532 "multipath route replace failed (check consistency of installed routes)");
4537 /* Because each route is added like a single route we remove
4538 * these flags after the first nexthop: if there is a collision,
4539 * we have already failed to add the first nexthop:
4540 * fib6_add_rt2node() has rejected it; when replacing, old
4541 * nexthops have been replaced by the first new one; the rest should
4544 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4549 /* success ... tell user about new route */
4550 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4554 /* send notification for routes that were added so that
4555 * the delete notifications sent by ip6_route_del are coherent
4559 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4561 /* Delete routes that were already added */
4562 list_for_each_entry(nh, &rt6_nh_list, next) {
4565 ip6_route_del(&nh->r_cfg, extack);
4569 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4571 fib6_info_release(nh->fib6_info);
4572 list_del(&nh->next);
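/*
 * [editor's note -- illustrative sketch, not part of route.c]
 * Both multipath paths above walk RTA_MULTIPATH the same way: a packed
 * array of struct rtnexthop records, each optionally followed by its
 * own nested attributes (RTA_GATEWAY, RTA_ENCAP, ...). The iteration
 * skeleton, using the kernel's rtnh_* helpers:
 */
static void ex_walk_multipath(struct rtnexthop *rtnh, int remaining)
{
	while (rtnh_ok(rtnh, remaining)) {	/* header still fits? */
		int attrlen = rtnh_attrlen(rtnh);

		if (attrlen > 0) {
			struct nlattr *attrs = rtnh_attrs(rtnh);

			/* nla_find(attrs, attrlen, RTA_GATEWAY) etc. */
			(void)attrs;
		}
		rtnh = rtnh_next(rtnh, &remaining);	/* advance, shrink */
	}
}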
4579 static int ip6_route_multipath_del(struct fib6_config *cfg,
4580 struct netlink_ext_ack *extack)
4582 struct fib6_config r_cfg;
4583 struct rtnexthop *rtnh;
4586 int err = 1, last_err = 0;
4588 remaining = cfg->fc_mp_len;
4589 rtnh = (struct rtnexthop *)cfg->fc_mp;
4591 /* Parse a Multipath Entry */
4592 while (rtnh_ok(rtnh, remaining)) {
4593 memcpy(&r_cfg, cfg, sizeof(*cfg));
4594 if (rtnh->rtnh_ifindex)
4595 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4597 attrlen = rtnh_attrlen(rtnh);
4599 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4601 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4603 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4604 r_cfg.fc_flags |= RTF_GATEWAY;
4607 err = ip6_route_del(&r_cfg, extack);
4611 rtnh = rtnh_next(rtnh, &remaining);
4617 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4618 struct netlink_ext_ack *extack)
4620 struct fib6_config cfg;
4623 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4628 return ip6_route_multipath_del(&cfg, extack);
4630 cfg.fc_delete_all_nh = 1;
4631 return ip6_route_del(&cfg, extack);
4635 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4636 struct netlink_ext_ack *extack)
4638 struct fib6_config cfg;
4641 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4645 if (cfg.fc_metric == 0)
4646 cfg.fc_metric = IP6_RT_PRIO_USER;
4649 return ip6_route_multipath_add(&cfg, extack);
4651 return ip6_route_add(&cfg, GFP_KERNEL, extack);
4654 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4656 int nexthop_len = 0;
4658 if (rt->fib6_nsiblings) {
4659 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
4660 + NLA_ALIGN(sizeof(struct rtnexthop))
4661 + nla_total_size(16) /* RTA_GATEWAY */
4662 + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws);
4664 nexthop_len *= rt->fib6_nsiblings;
4667 return NLMSG_ALIGN(sizeof(struct rtmsg))
4668 + nla_total_size(16) /* RTA_SRC */
4669 + nla_total_size(16) /* RTA_DST */
4670 + nla_total_size(16) /* RTA_GATEWAY */
4671 + nla_total_size(16) /* RTA_PREFSRC */
4672 + nla_total_size(4) /* RTA_TABLE */
4673 + nla_total_size(4) /* RTA_IIF */
4674 + nla_total_size(4) /* RTA_OIF */
4675 + nla_total_size(4) /* RTA_PRIORITY */
4676 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4677 + nla_total_size(sizeof(struct rta_cacheinfo))
4678 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4679 + nla_total_size(1) /* RTA_PREF */
4680 + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws)
4684 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4685 struct fib6_info *rt, struct dst_entry *dst,
4686 struct in6_addr *dest, struct in6_addr *src,
4687 int iif, int type, u32 portid, u32 seq,
4690 struct rt6_info *rt6 = (struct rt6_info *)dst;
4691 struct rt6key *rt6_dst, *rt6_src;
4692 u32 *pmetrics, table, rt6_flags;
4693 struct nlmsghdr *nlh;
4697 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4702 rt6_dst = &rt6->rt6i_dst;
4703 rt6_src = &rt6->rt6i_src;
4704 rt6_flags = rt6->rt6i_flags;
4706 rt6_dst = &rt->fib6_dst;
4707 rt6_src = &rt->fib6_src;
4708 rt6_flags = rt->fib6_flags;
4711 rtm = nlmsg_data(nlh);
4712 rtm->rtm_family = AF_INET6;
4713 rtm->rtm_dst_len = rt6_dst->plen;
4714 rtm->rtm_src_len = rt6_src->plen;
4717 table = rt->fib6_table->tb6_id;
4719 table = RT6_TABLE_UNSPEC;
4720 rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
4721 if (nla_put_u32(skb, RTA_TABLE, table))
4722 goto nla_put_failure;
4724 rtm->rtm_type = rt->fib6_type;
4726 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4727 rtm->rtm_protocol = rt->fib6_protocol;
4729 if (rt6_flags & RTF_CACHE)
4730 rtm->rtm_flags |= RTM_F_CLONED;
4733 if (nla_put_in6_addr(skb, RTA_DST, dest))
4734 goto nla_put_failure;
4735 rtm->rtm_dst_len = 128;
4736 } else if (rtm->rtm_dst_len)
4737 if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
4738 goto nla_put_failure;
4739 #ifdef CONFIG_IPV6_SUBTREES
4741 if (nla_put_in6_addr(skb, RTA_SRC, src))
4742 goto nla_put_failure;
4743 rtm->rtm_src_len = 128;
4744 } else if (rtm->rtm_src_len &&
4745 nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
4746 goto nla_put_failure;
4749 #ifdef CONFIG_IPV6_MROUTE
4750 if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
4751 int err = ip6mr_get_route(net, skb, rtm, portid);
4756 goto nla_put_failure;
4759 if (nla_put_u32(skb, RTA_IIF, iif))
4760 goto nla_put_failure;
4762 struct in6_addr saddr_buf;
4763 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4764 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4765 goto nla_put_failure;
4768 if (rt->fib6_prefsrc.plen) {
4769 struct in6_addr saddr_buf;
4770 saddr_buf = rt->fib6_prefsrc.addr;
4771 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4772 goto nla_put_failure;
4775 pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
4776 if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4777 goto nla_put_failure;
4779 if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
4780 goto nla_put_failure;
4782 /* For multipath routes, walk the siblings list and add
4783 * each as a nexthop within RTA_MULTIPATH.
4786 if (rt6_flags & RTF_GATEWAY &&
4787 nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
4788 goto nla_put_failure;
4790 if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
4791 goto nla_put_failure;
4792 } else if (rt->fib6_nsiblings) {
4793 struct fib6_info *sibling, *next_sibling;
4796 mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
4798 goto nla_put_failure;
4800 if (fib_add_nexthop(skb, &rt->fib6_nh.nh_common,
4801 rt->fib6_nh.fib_nh_weight) < 0)
4802 goto nla_put_failure;
4804 list_for_each_entry_safe(sibling, next_sibling,
4805 &rt->fib6_siblings, fib6_siblings) {
4806 if (fib_add_nexthop(skb, &sibling->fib6_nh.nh_common,
4807 sibling->fib6_nh.fib_nh_weight) < 0)
4808 goto nla_put_failure;
4811 nla_nest_end(skb, mp);
4813 unsigned char nh_flags = 0;
4815 if (fib_nexthop_info(skb, &rt->fib6_nh.nh_common,
4816 &nh_flags, false) < 0)
4817 goto nla_put_failure;
4819 rtm->rtm_flags |= nh_flags;
4822 if (rt6_flags & RTF_EXPIRES) {
4823 expires = dst ? dst->expires : rt->expires;
4827 if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4828 goto nla_put_failure;
4830 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
4831 goto nla_put_failure;
4834 nlmsg_end(skb, nlh);
4838 nlmsg_cancel(skb, nlh);
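/*
 * [editor's note -- illustrative sketch, not part of route.c]
 * The worst-case sizing in rt6_nlmsg_size() above leans on
 * nla_total_size(payload) == NLA_ALIGN(NLA_HDRLEN + payload): a 4-byte
 * attribute header plus the payload, rounded up to a 4-byte boundary.
 * Hence nla_total_size(4) == 8 for a u32 and nla_total_size(16) == 20
 * for an in6_addr:
 */
static int ex_nla_total_size(int payload)
{
	const int hdrlen = 4;			/* NLA_HDRLEN */

	return (hdrlen + payload + 3) & ~3;	/* NLA_ALIGN() */
}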
4842 static bool fib6_info_uses_dev(const struct fib6_info *f6i,
4843 const struct net_device *dev)
4845 if (f6i->fib6_nh.fib_nh_dev == dev)
4848 if (f6i->fib6_nsiblings) {
4849 struct fib6_info *sibling, *next_sibling;
4851 list_for_each_entry_safe(sibling, next_sibling,
4852 &f6i->fib6_siblings, fib6_siblings) {
4853 if (sibling->fib6_nh.fib_nh_dev == dev)
4861 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4863 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4864 struct fib_dump_filter *filter = &arg->filter;
4865 unsigned int flags = NLM_F_MULTI;
4866 struct net *net = arg->net;
4868 if (rt == net->ipv6.fib6_null_entry)
4871 if ((filter->flags & RTM_F_PREFIX) &&
4872 !(rt->fib6_flags & RTF_PREFIX_RT)) {
4873 /* success since this is not a prefix route */
4876 if (filter->filter_set) {
4877 if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
4878 (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
4879 (filter->protocol && rt->fib6_protocol != filter->protocol)) {
4882 flags |= NLM_F_DUMP_FILTERED;
4885 return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4886 RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4887 arg->cb->nlh->nlmsg_seq, flags);
4890 static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
4891 const struct nlmsghdr *nlh,
4893 struct netlink_ext_ack *extack)
4898 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
4899 NL_SET_ERR_MSG_MOD(extack,
4900 "Invalid header for get route request");
4904 if (!netlink_strict_get_check(skb))
4905 return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
4906 rtm_ipv6_policy, extack);
4908 rtm = nlmsg_data(nlh);
4909 if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
4910 (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
4911 rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
4913 NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
4916 if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
4917 NL_SET_ERR_MSG_MOD(extack,
4918 "Invalid flags for get route request");
4922 err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
4923 rtm_ipv6_policy, extack);
4927 if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
4928 (tb[RTA_DST] && !rtm->rtm_dst_len)) {
4929 NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
4933 for (i = 0; i <= RTA_MAX; i++) {
4949 NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct fib6_info *from;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6 = {};
	bool fibmatch;

	err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (tb[RTA_SPORT])
		fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);

	if (tb[RTA_DPORT])
		fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);

	if (tb[RTA_IP_PROTO]) {
		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
						  &fl6.flowi6_proto, AF_INET6,
						  extack);
		if (err)
			goto errout;
	}

	if (iif) {
		struct net_device *dev;
		int flags = 0;

		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			rcu_read_unlock();
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);

		rcu_read_unlock();
	} else {
		fl6.flowi6_oif = oif;

		dst = ip6_route_output(net, NULL, &fl6);
	}

	rt = container_of(dst, struct rt6_info, dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	skb_dst_set(skb, &rt->dst);

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	if (from) {
		if (fibmatch)
			err = rt6_fill_node(net, skb, from, NULL, NULL, NULL,
					    iif, RTM_NEWROUTE,
					    NETLINK_CB(in_skb).portid,
					    nlh->nlmsg_seq, 0);
		else
			err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
					    &fl6.saddr, iif, RTM_NEWROUTE,
					    NETLINK_CB(in_skb).portid,
					    nlh->nlmsg_seq, 0);
	} else {
		err = -ENETUNREACH;
	}
	rcu_read_unlock();

	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}
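
/* Broadcast a route add/delete event to RTNLGRP_IPV6_ROUTE listeners. */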
void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
		     unsigned int nlm_flags)
{
	struct sk_buff *skb;
	struct net *net = info->nl_net;
	u32 seq;
	int err;

	err = -ENOBUFS;
	seq = info->nlh ? info->nlh->nlmsg_seq : 0;

	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
	if (!skb)
		goto errout;

	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
			    event, info->portid, seq, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}
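
/* Loopback netdev notifier: the null/prohibit/blackhole entries hold a
 * reference on the namespace's loopback device, taken on
 * NETDEV_REGISTER and dropped (exactly once) on NETDEV_UNREGISTER.
 */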
static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		net->ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = dev;
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	} else if (event == NETDEV_UNREGISTER &&
		   dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER could be fired for multiple times by
		 * netdev_wait_allrefs(). Make sure we only call this once.
		 */
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}

/*
 *	/proc
 */
#ifdef CONFIG_PROC_FS
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;

	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}
#endif	/* CONFIG_PROC_FS */
#ifdef CONFIG_SYSCTL

static
int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
			      void __user *buffer, size_t *lenp, loff_t *ppos)
{
	struct net *net;
	int delay;
	int ret;

	if (!write)
		return -EINVAL;

	net = (struct net *)ctl->extra1;
	delay = net->ipv6.sysctl.flush_delay;
	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
	if (ret)
		return ret;

	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
	return 0;
}
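
/* Template for the net.ipv6.route.* sysctls; per-namespace copies are
 * made in ipv6_route_sysctl_init() below. "flush" is write-only, the
 * rest are plain tunables.
 */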
static struct ctl_table ipv6_route_table_template[] = {
	{
		.procname = "flush",
		.data = &init_net.ipv6.sysctl.flush_delay,
		.maxlen = sizeof(int),
		.mode = 0200,
		.proc_handler = ipv6_sysctl_rtcache_flush
	},
	{
		.procname = "gc_thresh",
		.data = &ip6_dst_ops_template.gc_thresh,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "max_size",
		.data = &init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "gc_min_interval",
		.data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_jiffies,
	},
	{
		.procname = "gc_timeout",
		.data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_jiffies,
	},
	{
		.procname = "gc_interval",
		.data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_jiffies,
	},
	{
		.procname = "gc_elasticity",
		.data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "mtu_expires",
		.data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_jiffies,
	},
	{
		.procname = "min_adv_mss",
		.data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "gc_min_interval_ms",
		.data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_ms_jiffies,
	},
	{
		.procname = "skip_notify_on_dev_down",
		.data = &init_net.ipv6.sysctl.skip_notify_on_dev_down,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{ }
};
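
/* Clone the sysctl template and point each entry at this namespace's
 * data; the flush handler additionally needs the namespace in extra1.
 */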
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
#endif	/* CONFIG_SYSCTL */
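
/* Per-namespace setup: dst ops, the always-present null entry (plus
 * prohibit/blackhole entries with multiple tables), and the gc/sysctl
 * defaults. Unwind labels run in reverse order of allocation.
 */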
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
	net->ipv6.sysctl.skip_notify_on_dev_down = 0;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
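
/* Free everything ip6_route_net_init() allocated. */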
static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
			sizeof(struct ipv6_route_iter));
	proc_create_net_single("rt6_stats", 0444, net->proc_net,
			       rt6_stats_seq_show, NULL);
#endif
	return 0;
}

static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
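
/* Per-namespace inet_peer base; IPv6 uses it e.g. for ICMPv6 rate
 * limiting state.
 */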
static int __net_init ipv6_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv6.peers = bp;
	return 0;
}

static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	net->ipv6.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static struct pernet_operations ipv6_inetpeer_ops = {
	.init = ipv6_inetpeer_init,
	.exit = ipv6_inetpeer_exit,
};
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};

static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
void __init ip6_route_init_special_entries(void)
{
	/* Registering of the loopback is done before this portion of code,
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for init_net
	 */
	init_net.ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#endif
}
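
/* Subsystem init: dst cache, pernet state, FIB rules and the rtnetlink
 * route handlers; error labels unwind in reverse order of registration.
 */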
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}