/*
 *	Linux INET6 implementation
 *
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable.  otherwise, round-robin the list.
 *
 *	Fixed routing subtrees.
 */

27 #define pr_fmt(fmt) "IPv6: " fmt
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
56 #include <linux/rtnetlink.h>
58 #include <net/dst_metadata.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
67 #include <linux/uaccess.h>
70 #include <linux/sysctl.h>
73 static int ip6_rt_type_to_error(u8 fib6_type);
75 #define CREATE_TRACE_POINTS
76 #include <trace/events/fib6.h>
77 EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
78 #undef CREATE_TRACE_POINTS
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};

87 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
88 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
89 static unsigned int ip6_mtu(const struct dst_entry *dst);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void ip6_dst_destroy(struct dst_entry *);
92 static void ip6_dst_ifdown(struct dst_entry *,
93 struct net_device *dev, int how);
94 static int ip6_dst_gc(struct dst_ops *ops);
96 static int ip6_pkt_discard(struct sk_buff *skb);
97 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
98 static int ip6_pkt_prohibit(struct sk_buff *skb);
99 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
100 static void ip6_link_failure(struct sk_buff *skb);
101 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
102 struct sk_buff *skb, u32 mtu);
103 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
104 struct sk_buff *skb);
105 static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
106 static size_t rt6_nlmsg_size(struct fib6_info *rt);
107 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
108 struct fib6_info *rt, struct dst_entry *dst,
109 struct in6_addr *dest, struct in6_addr *src,
110 int iif, int type, u32 portid, u32 seq,
112 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
113 struct in6_addr *daddr,
114 struct in6_addr *saddr);
116 #ifdef CONFIG_IPV6_ROUTE_INFO
117 static struct fib6_info *rt6_add_route_info(struct net *net,
118 const struct in6_addr *prefix, int prefixlen,
119 const struct in6_addr *gwaddr,
120 struct net_device *dev,
122 static struct fib6_info *rt6_get_route_info(struct net *net,
123 const struct in6_addr *prefix, int prefixlen,
124 const struct in6_addr *gwaddr,
125 struct net_device *dev);
struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
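/* Routes that are not linked into the fib6 tree (e.g. cache clones built
 * for ICMP or tunnel output) sit on a per-cpu "uncached" list, so that
 * rt6_uncached_list_flush_dev() can find them and re-point them at the
 * loopback device when their original device goes away.
 */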
135 void rt6_uncached_list_add(struct rt6_info *rt)
137 struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
139 rt->rt6i_uncached_list = ul;
141 spin_lock_bh(&ul->lock);
142 list_add_tail(&rt->rt6i_uncached, &ul->head);
143 spin_unlock_bh(&ul->lock);
146 void rt6_uncached_list_del(struct rt6_info *rt)
148 if (!list_empty(&rt->rt6i_uncached)) {
149 struct uncached_list *ul = rt->rt6i_uncached_list;
150 struct net *net = dev_net(rt->dst.dev);
152 spin_lock_bh(&ul->lock);
153 list_del(&rt->rt6i_uncached);
154 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
155 spin_unlock_bh(&ul->lock);
159 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
161 struct net_device *loopback_dev = net->loopback_dev;
164 if (dev == loopback_dev)
167 for_each_possible_cpu(cpu) {
168 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
171 spin_lock_bh(&ul->lock);
172 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
173 struct inet6_dev *rt_idev = rt->rt6i_idev;
174 struct net_device *rt_dev = rt->dst.dev;
176 if (rt_idev->dev == dev) {
177 rt->rt6i_idev = in6_dev_get(loopback_dev);
178 in6_dev_put(rt_idev);
182 rt->dst.dev = loopback_dev;
183 dev_hold(rt->dst.dev);
187 spin_unlock_bh(&ul->lock);
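/* Neighbour resolution keys on the configured gateway when the route has
 * one; for on-link routes (gateway ::) the packet's own destination
 * address is used instead.
 */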
191 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
195 if (!ipv6_addr_any(p))
196 return (const void *) p;
198 return &ipv6_hdr(skb)->daddr;
202 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
203 struct net_device *dev,
209 daddr = choose_neigh_daddr(gw, skb, daddr);
210 n = __ipv6_neigh_lookup(dev, daddr);
214 n = neigh_create(&nd_tbl, daddr, dev);
215 return IS_ERR(n) ? NULL : n;
218 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
222 const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
224 return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
227 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
229 struct net_device *dev = dst->dev;
230 struct rt6_info *rt = (struct rt6_info *)dst;
232 daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
235 if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
237 if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
239 __ipv6_confirm_neigh(dev, daddr);
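/* Handlers for ordinary IPv6 dsts.  The blackhole variant further down
 * shares most of these hooks but uses no-op update_pmtu/redirect
 * callbacks, so blackholed dsts never feed back into routing state.
 */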
242 static struct dst_ops ip6_dst_ops_template = {
246 .check = ip6_dst_check,
247 .default_advmss = ip6_default_advmss,
249 .cow_metrics = dst_cow_metrics_generic,
250 .destroy = ip6_dst_destroy,
251 .ifdown = ip6_dst_ifdown,
252 .negative_advice = ip6_negative_advice,
253 .link_failure = ip6_link_failure,
254 .update_pmtu = ip6_rt_update_pmtu,
255 .redirect = rt6_do_redirect,
256 .local_out = __ip6_local_out,
257 .neigh_lookup = ip6_dst_neigh_lookup,
258 .confirm_neigh = ip6_confirm_neigh,
261 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
263 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
265 return mtu ? : dst->dev->mtu;
268 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
269 struct sk_buff *skb, u32 mtu)
273 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
278 static struct dst_ops ip6_dst_blackhole_ops = {
280 .destroy = ip6_dst_destroy,
281 .check = ip6_dst_check,
282 .mtu = ip6_blackhole_mtu,
283 .default_advmss = ip6_default_advmss,
284 .update_pmtu = ip6_rt_blackhole_update_pmtu,
285 .redirect = ip6_rt_blackhole_redirect,
286 .cow_metrics = dst_cow_metrics_generic,
287 .neigh_lookup = ip6_dst_neigh_lookup,
290 static const u32 ip6_template_metrics[RTAX_MAX] = {
291 [RTAX_HOPLIMIT - 1] = 0,
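/* Templates for the special sentinel routes: fib6_null_entry/ip6_null_entry
 * answer "no route" lookups with -ENETUNREACH, while the prohibit and
 * blackhole templates back the corresponding policy-routing actions when
 * CONFIG_IPV6_MULTIPLE_TABLES is enabled.
 */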
294 static const struct fib6_info fib6_null_entry_template = {
295 .fib6_flags = (RTF_REJECT | RTF_NONEXTHOP),
296 .fib6_protocol = RTPROT_KERNEL,
297 .fib6_metric = ~(u32)0,
298 .fib6_ref = ATOMIC_INIT(1),
299 .fib6_type = RTN_UNREACHABLE,
300 .fib6_metrics = (struct dst_metrics *)&dst_default_metrics,
303 static const struct rt6_info ip6_null_entry_template = {
305 .__refcnt = ATOMIC_INIT(1),
307 .obsolete = DST_OBSOLETE_FORCE_CHK,
308 .error = -ENETUNREACH,
309 .input = ip6_pkt_discard,
310 .output = ip6_pkt_discard_out,
312 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
315 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
317 static const struct rt6_info ip6_prohibit_entry_template = {
319 .__refcnt = ATOMIC_INIT(1),
321 .obsolete = DST_OBSOLETE_FORCE_CHK,
323 .input = ip6_pkt_prohibit,
324 .output = ip6_pkt_prohibit_out,
326 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
329 static const struct rt6_info ip6_blk_hole_entry_template = {
331 .__refcnt = ATOMIC_INIT(1),
333 .obsolete = DST_OBSOLETE_FORCE_CHK,
335 .input = dst_discard,
336 .output = dst_discard_out,
338 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
343 static void rt6_info_init(struct rt6_info *rt)
345 struct dst_entry *dst = &rt->dst;
347 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
348 INIT_LIST_HEAD(&rt->rt6i_uncached);
351 /* allocate dst with ip6_dst_ops */
352 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
355 struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
356 1, DST_OBSOLETE_FORCE_CHK, flags);
360 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
365 EXPORT_SYMBOL(ip6_dst_alloc);
367 static void ip6_dst_destroy(struct dst_entry *dst)
369 struct rt6_info *rt = (struct rt6_info *)dst;
370 struct fib6_info *from;
371 struct inet6_dev *idev;
373 ip_dst_metrics_put(dst);
374 rt6_uncached_list_del(rt);
376 idev = rt->rt6i_idev;
378 rt->rt6i_idev = NULL;
383 from = rcu_dereference(rt->from);
384 rcu_assign_pointer(rt->from, NULL);
385 fib6_info_release(from);
389 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
392 struct rt6_info *rt = (struct rt6_info *)dst;
393 struct inet6_dev *idev = rt->rt6i_idev;
394 struct net_device *loopback_dev =
395 dev_net(dev)->loopback_dev;
397 if (idev && idev->dev != loopback_dev) {
398 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
400 rt->rt6i_idev = loopback_idev;
406 static bool __rt6_check_expired(const struct rt6_info *rt)
408 if (rt->rt6i_flags & RTF_EXPIRES)
409 return time_after(jiffies, rt->dst.expires);
414 static bool rt6_check_expired(const struct rt6_info *rt)
416 struct fib6_info *from;
418 from = rcu_dereference(rt->from);
420 if (rt->rt6i_flags & RTF_EXPIRES) {
421 if (time_after(jiffies, rt->dst.expires))
424 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
425 fib6_check_expired(from);
430 struct fib6_info *fib6_multipath_select(const struct net *net,
431 struct fib6_info *match,
432 struct flowi6 *fl6, int oif,
433 const struct sk_buff *skb,
436 struct fib6_info *sibling, *next_sibling;
	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
442 fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
444 if (fl6->mp_hash <= atomic_read(&match->fib6_nh.fib_nh_upper_bound))
447 list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
451 nh_upper_bound = atomic_read(&sibling->fib6_nh.fib_nh_upper_bound);
452 if (fl6->mp_hash > nh_upper_bound)
454 if (rt6_score_route(sibling, oif, strict) < 0)
464 * Route lookup. rcu_read_lock() should be held.
467 static inline struct fib6_info *rt6_device_match(struct net *net,
468 struct fib6_info *rt,
469 const struct in6_addr *saddr,
473 struct fib6_info *sprt;
475 if (!oif && ipv6_addr_any(saddr) &&
476 !(rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD))
479 for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
480 const struct net_device *dev = sprt->fib6_nh.fib_nh_dev;
482 if (sprt->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
486 if (dev->ifindex == oif)
489 if (ipv6_chk_addr(net, saddr, dev,
490 flags & RT6_LOOKUP_F_IFACE))
495 if (oif && flags & RT6_LOOKUP_F_IFACE)
496 return net->ipv6.fib6_null_entry;
498 return rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
501 #ifdef CONFIG_IPV6_ROUTER_PREF
502 struct __rt6_probe_work {
503 struct work_struct work;
504 struct in6_addr target;
505 struct net_device *dev;
508 static void rt6_probe_deferred(struct work_struct *w)
510 struct in6_addr mcaddr;
511 struct __rt6_probe_work *work =
512 container_of(w, struct __rt6_probe_work, work);
514 addrconf_addr_solict_mult(&work->target, &mcaddr);
515 ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
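/* The solicitation itself is sent from a workqueue: rt6_probe() below runs
 * under rcu_read_lock_bh() and the neighbour lock, so the NS transmit is
 * deferred to process context.
 */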
520 static void rt6_probe(struct fib6_info *rt)
522 struct __rt6_probe_work *work = NULL;
523 const struct in6_addr *nh_gw;
524 struct neighbour *neigh;
525 struct net_device *dev;
526 struct inet6_dev *idev;
	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
536 if (!rt || !rt->fib6_nh.fib_nh_has_gw)
539 nh_gw = &rt->fib6_nh.fib_nh_gw6;
540 dev = rt->fib6_nh.fib_nh_dev;
542 idev = __in6_dev_get(dev);
543 neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
545 if (neigh->nud_state & NUD_VALID)
548 write_lock(&neigh->lock);
549 if (!(neigh->nud_state & NUD_VALID) &&
551 neigh->updated + idev->cnf.rtr_probe_interval)) {
552 work = kmalloc(sizeof(*work), GFP_ATOMIC);
554 __neigh_set_probe_once(neigh);
556 write_unlock(&neigh->lock);
557 } else if (time_after(jiffies, rt->last_probe +
558 idev->cnf.rtr_probe_interval)) {
559 work = kmalloc(sizeof(*work), GFP_ATOMIC);
563 rt->last_probe = jiffies;
564 INIT_WORK(&work->work, rt6_probe_deferred);
565 work->target = *nh_gw;
568 schedule_work(&work->work);
572 rcu_read_unlock_bh();
575 static inline void rt6_probe(struct fib6_info *rt)
/*
 *	Default Router Selection (RFC 2461 6.3.6)
 */
583 static inline int rt6_check_dev(struct fib6_info *rt, int oif)
585 const struct net_device *dev = rt->fib6_nh.fib_nh_dev;
587 if (!oif || dev->ifindex == oif)
592 static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
594 enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
595 struct neighbour *neigh;
597 if (rt->fib6_flags & RTF_NONEXTHOP ||
598 !rt->fib6_nh.fib_nh_has_gw)
599 return RT6_NUD_SUCCEED;
602 neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.fib_nh_dev,
603 &rt->fib6_nh.fib_nh_gw6);
605 read_lock(&neigh->lock);
606 if (neigh->nud_state & NUD_VALID)
607 ret = RT6_NUD_SUCCEED;
608 #ifdef CONFIG_IPV6_ROUTER_PREF
609 else if (!(neigh->nud_state & NUD_FAILED))
610 ret = RT6_NUD_SUCCEED;
612 ret = RT6_NUD_FAIL_PROBE;
614 read_unlock(&neigh->lock);
616 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
617 RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
619 rcu_read_unlock_bh();
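/* rt6_score_route() combines an outgoing-interface match, the router
 * preference advertised in RAs (CONFIG_IPV6_ROUTER_PREF) and neighbour
 * reachability into a single score; the negative RT6_NUD_* values mark
 * routers that must be skipped or only used after round-robining.
 */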
624 static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
628 m = rt6_check_dev(rt, oif);
629 if (!m && (strict & RT6_LOOKUP_F_IFACE))
630 return RT6_NUD_FAIL_HARD;
631 #ifdef CONFIG_IPV6_ROUTER_PREF
632 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
634 if (strict & RT6_LOOKUP_F_REACHABLE) {
635 int n = rt6_check_neigh(rt);
642 static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
643 int *mpri, struct fib6_info *match,
647 bool match_do_rr = false;
649 if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
652 if (ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev) &&
653 rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN &&
654 !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
657 if (fib6_check_expired(rt))
660 m = rt6_score_route(rt, oif, strict);
661 if (m == RT6_NUD_FAIL_DO_RR) {
663 m = 0; /* lowest valid score */
664 } else if (m == RT6_NUD_FAIL_HARD) {
668 if (strict & RT6_LOOKUP_F_REACHABLE)
671 /* note that m can be RT6_NUD_FAIL_PROBE at this point */
673 *do_rr = match_do_rr;
681 static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
682 struct fib6_info *leaf,
683 struct fib6_info *rr_head,
684 u32 metric, int oif, int strict,
687 struct fib6_info *rt, *match, *cont;
692 for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
693 if (rt->fib6_metric != metric) {
698 match = find_match(rt, oif, strict, &mpri, match, do_rr);
701 for (rt = leaf; rt && rt != rr_head;
702 rt = rcu_dereference(rt->fib6_next)) {
703 if (rt->fib6_metric != metric) {
708 match = find_match(rt, oif, strict, &mpri, match, do_rr);
714 for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
715 match = find_match(rt, oif, strict, &mpri, match, do_rr);
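/* rt6_select() scores the equal-metric routes starting at fn->rr_ptr and
 * then the rest of the leaf list; when the best candidate only scores
 * RT6_NUD_FAIL_DO_RR, the rr pointer is advanced under tb6_lock so that
 * successive lookups round-robin over equally good default routers.
 */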
720 static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
723 struct fib6_info *leaf = rcu_dereference(fn->leaf);
724 struct fib6_info *match, *rt0;
728 if (!leaf || leaf == net->ipv6.fib6_null_entry)
729 return net->ipv6.fib6_null_entry;
731 rt0 = rcu_dereference(fn->rr_ptr);
735 /* Double check to make sure fn is not an intermediate node
736 * and fn->leaf does not points to its child's leaf
737 * (This might happen if all routes under fn are deleted from
738 * the tree and fib6_repair_tree() is called on the node.)
740 key_plen = rt0->fib6_dst.plen;
741 #ifdef CONFIG_IPV6_SUBTREES
742 if (rt0->fib6_src.plen)
743 key_plen = rt0->fib6_src.plen;
745 if (fn->fn_bit != key_plen)
746 return net->ipv6.fib6_null_entry;
748 match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
752 struct fib6_info *next = rcu_dereference(rt0->fib6_next);
754 /* no entries matched; do round-robin */
755 if (!next || next->fib6_metric != rt0->fib6_metric)
759 spin_lock_bh(&leaf->fib6_table->tb6_lock);
760 /* make sure next is not being deleted from the tree */
762 rcu_assign_pointer(fn->rr_ptr, next);
763 spin_unlock_bh(&leaf->fib6_table->tb6_lock);
767 return match ? match : net->ipv6.fib6_null_entry;
770 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
772 return (rt->fib6_flags & RTF_NONEXTHOP) || rt->fib6_nh.fib_nh_has_gw;
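/* rt6_route_rcv() handles a Route Information option carried in a Router
 * Advertisement (RFC 4191): after validating the option and prefix
 * lengths it adds, refreshes or (on zero lifetime) removes the matching
 * RTF_ROUTEINFO route learned from the advertising router.
 */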
775 #ifdef CONFIG_IPV6_ROUTE_INFO
776 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
777 const struct in6_addr *gwaddr)
779 struct net *net = dev_net(dev);
780 struct route_info *rinfo = (struct route_info *) opt;
781 struct in6_addr prefix_buf, *prefix;
783 unsigned long lifetime;
784 struct fib6_info *rt;
786 if (len < sizeof(struct route_info)) {
790 /* Sanity check for prefix_len and length */
791 if (rinfo->length > 3) {
793 } else if (rinfo->prefix_len > 128) {
795 } else if (rinfo->prefix_len > 64) {
796 if (rinfo->length < 2) {
799 } else if (rinfo->prefix_len > 0) {
800 if (rinfo->length < 1) {
805 pref = rinfo->route_pref;
806 if (pref == ICMPV6_ROUTER_PREF_INVALID)
809 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
811 if (rinfo->length == 3)
812 prefix = (struct in6_addr *)rinfo->prefix;
814 /* this function is safe */
815 ipv6_addr_prefix(&prefix_buf,
816 (struct in6_addr *)rinfo->prefix,
818 prefix = &prefix_buf;
821 if (rinfo->prefix_len == 0)
822 rt = rt6_get_dflt_router(net, gwaddr, dev);
824 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
827 if (rt && !lifetime) {
833 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
836 rt->fib6_flags = RTF_ROUTEINFO |
837 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
840 if (!addrconf_finite_timeout(lifetime))
841 fib6_clean_expires(rt);
843 fib6_set_expires(rt, jiffies + HZ * lifetime);
845 fib6_info_release(rt);
/*
 *	Misc support functions
 */
855 /* called with rcu_lock held */
856 static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
858 struct net_device *dev = rt->fib6_nh.fib_nh_dev;
860 if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
861 /* for copies of local routes, dst->dev needs to be the
862 * device if it is a master device, the master device if
863 * device is enslaved, and the loopback as the default
865 if (netif_is_l3_slave(dev) &&
866 !rt6_need_strict(&rt->fib6_dst.addr))
867 dev = l3mdev_master_dev_rcu(dev);
868 else if (!netif_is_l3_master(dev))
869 dev = dev_net(dev)->loopback_dev;
870 /* last case is netif_is_l3_master(dev) is true in which
871 * case we want dev returned to be dev
878 static const int fib6_prop[RTN_MAX + 1] = {
885 [RTN_BLACKHOLE] = -EINVAL,
886 [RTN_UNREACHABLE] = -EHOSTUNREACH,
887 [RTN_PROHIBIT] = -EACCES,
888 [RTN_THROW] = -EAGAIN,
890 [RTN_XRESOLVE] = -EINVAL,
893 static int ip6_rt_type_to_error(u8 fib6_type)
895 return fib6_prop[fib6_type];
898 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
900 unsigned short flags = 0;
903 flags |= DST_NOCOUNT;
904 if (rt->dst_nopolicy)
905 flags |= DST_NOPOLICY;
912 static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
914 rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);
916 switch (ort->fib6_type) {
918 rt->dst.output = dst_discard_out;
919 rt->dst.input = dst_discard;
922 rt->dst.output = ip6_pkt_prohibit_out;
923 rt->dst.input = ip6_pkt_prohibit;
926 case RTN_UNREACHABLE:
928 rt->dst.output = ip6_pkt_discard_out;
929 rt->dst.input = ip6_pkt_discard;
934 static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
936 if (ort->fib6_flags & RTF_REJECT) {
937 ip6_rt_init_dst_reject(rt, ort);
942 rt->dst.output = ip6_output;
944 if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
945 rt->dst.input = ip6_input;
946 } else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
947 rt->dst.input = ip6_mc_input;
949 rt->dst.input = ip6_forward;
952 if (ort->fib6_nh.fib_nh_lws) {
953 rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.fib_nh_lws);
954 lwtunnel_set_redirect(&rt->dst);
957 rt->dst.lastuse = jiffies;
960 /* Caller must already hold reference to @from */
961 static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
963 rt->rt6i_flags &= ~RTF_EXPIRES;
964 rcu_assign_pointer(rt->from, from);
965 ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
968 /* Caller must already hold reference to @ort */
969 static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
971 struct net_device *dev = fib6_info_nh_dev(ort);
973 ip6_rt_init_dst(rt, ort);
975 rt->rt6i_dst = ort->fib6_dst;
976 rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
977 rt->rt6i_flags = ort->fib6_flags;
978 if (ort->fib6_nh.fib_nh_has_gw) {
979 rt->rt6i_gateway = ort->fib6_nh.fib_nh_gw6;
980 rt->rt6i_flags |= RTF_GATEWAY;
982 rt6_set_from(rt, ort);
983 #ifdef CONFIG_IPV6_SUBTREES
984 rt->rt6i_src = ort->fib6_src;
988 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
989 struct in6_addr *saddr)
991 struct fib6_node *pn, *sn;
993 if (fn->fn_flags & RTN_TL_ROOT)
995 pn = rcu_dereference(fn->parent);
996 sn = FIB6_SUBTREE(pn);
998 fn = fib6_node_lookup(sn, NULL, saddr);
1001 if (fn->fn_flags & RTN_RTINFO)
1006 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
1008 struct rt6_info *rt = *prt;
1010 if (dst_hold_safe(&rt->dst))
1013 rt = net->ipv6.ip6_null_entry;
1022 /* called with rcu_lock held */
1023 static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
1025 unsigned short flags = fib6_info_dst_flags(rt);
1026 struct net_device *dev = rt->fib6_nh.fib_nh_dev;
1027 struct rt6_info *nrt;
1029 if (!fib6_info_hold_safe(rt))
1032 nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1034 fib6_info_release(rt);
1038 ip6_rt_copy_init(nrt, rt);
1042 nrt = dev_net(dev)->ipv6.ip6_null_entry;
1043 dst_hold(&nrt->dst);
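/* ip6_pol_route_lookup() is the lightweight lookup path used by
 * rt6_lookup(): it walks the tree, applies device/source matching and
 * multipath selection, then returns a cached exception if one exists,
 * the null entry on failure, or an rcu-safe clone from ip6_create_rt_rcu().
 */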
1047 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
1048 struct fib6_table *table,
1050 const struct sk_buff *skb,
1053 struct fib6_info *f6i;
1054 struct fib6_node *fn;
1055 struct rt6_info *rt;
1057 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1058 flags &= ~RT6_LOOKUP_F_IFACE;
1061 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1063 f6i = rcu_dereference(fn->leaf);
1065 f6i = net->ipv6.fib6_null_entry;
1067 f6i = rt6_device_match(net, f6i, &fl6->saddr,
1068 fl6->flowi6_oif, flags);
1069 if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
1070 f6i = fib6_multipath_select(net, f6i, fl6,
1071 fl6->flowi6_oif, skb,
1074 if (f6i == net->ipv6.fib6_null_entry) {
1075 fn = fib6_backtrack(fn, &fl6->saddr);
1080 trace_fib6_table_lookup(net, f6i, table, fl6);
1082 /* Search through exception table */
1083 rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1085 if (ip6_hold_safe(net, &rt))
1086 dst_use_noref(&rt->dst, jiffies);
1087 } else if (f6i == net->ipv6.fib6_null_entry) {
1088 rt = net->ipv6.ip6_null_entry;
1091 rt = ip6_create_rt_rcu(f6i);
1099 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
1100 const struct sk_buff *skb, int flags)
1102 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
1104 EXPORT_SYMBOL_GPL(ip6_route_lookup);
1106 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1107 const struct in6_addr *saddr, int oif,
1108 const struct sk_buff *skb, int strict)
1110 struct flowi6 fl6 = {
1114 struct dst_entry *dst;
1115 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1118 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1119 flags |= RT6_LOOKUP_F_HAS_SADDR;
1122 dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1123 if (dst->error == 0)
1124 return (struct rt6_info *) dst;
1130 EXPORT_SYMBOL(rt6_lookup);
/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason, the
 * route is released.
 * Caller must hold dst before calling it.
 */
1138 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1139 struct netlink_ext_ack *extack)
1142 struct fib6_table *table;
1144 table = rt->fib6_table;
1145 spin_lock_bh(&table->tb6_lock);
1146 err = fib6_add(&table->tb6_root, rt, info, extack);
1147 spin_unlock_bh(&table->tb6_lock);
1152 int ip6_ins_rt(struct net *net, struct fib6_info *rt)
1154 struct nl_info info = { .nl_net = net, };
1156 return __ip6_ins_rt(rt, &info, NULL);
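/* ip6_rt_cache_alloc() builds an RTF_CACHE clone of @ort narrowed to a
 * /128 destination (and, with CONFIG_IPV6_SUBTREES, a /128 source when
 * @saddr is given).  Such clones end up in the per-route exception table
 * or on the uncached list, never in the fib6 tree itself.
 */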
1159 static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
1160 const struct in6_addr *daddr,
1161 const struct in6_addr *saddr)
1163 struct net_device *dev;
1164 struct rt6_info *rt;
1170 if (!fib6_info_hold_safe(ort))
1173 dev = ip6_rt_get_dev_rcu(ort);
1174 rt = ip6_dst_alloc(dev_net(dev), dev, 0);
1176 fib6_info_release(ort);
1180 ip6_rt_copy_init(rt, ort);
1181 rt->rt6i_flags |= RTF_CACHE;
1182 rt->dst.flags |= DST_HOST;
1183 rt->rt6i_dst.addr = *daddr;
1184 rt->rt6i_dst.plen = 128;
1186 if (!rt6_is_gw_or_nonexthop(ort)) {
1187 if (ort->fib6_dst.plen != 128 &&
1188 ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
1189 rt->rt6i_flags |= RTF_ANYCAST;
1190 #ifdef CONFIG_IPV6_SUBTREES
1191 if (rt->rt6i_src.plen && saddr) {
1192 rt->rt6i_src.addr = *saddr;
1193 rt->rt6i_src.plen = 128;
1201 static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
1203 unsigned short flags = fib6_info_dst_flags(rt);
1204 struct net_device *dev;
1205 struct rt6_info *pcpu_rt;
1207 if (!fib6_info_hold_safe(rt))
1211 dev = ip6_rt_get_dev_rcu(rt);
1212 pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
1215 fib6_info_release(rt);
1218 ip6_rt_copy_init(pcpu_rt, rt);
1219 pcpu_rt->rt6i_flags |= RTF_PCPU;
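/* Per-cpu rt6_info copies help avoid dst refcount contention on the
 * output path: rt6_get_pcpu_route() reuses this cpu's copy when present,
 * otherwise rt6_make_pcpu_route() installs a fresh one with cmpxchg() so
 * concurrent allocators converge on a single copy per cpu.
 */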
1223 /* It should be called with rcu_read_lock() acquired */
1224 static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
1226 struct rt6_info *pcpu_rt, **p;
1228 p = this_cpu_ptr(rt->rt6i_pcpu);
1232 ip6_hold_safe(NULL, &pcpu_rt);
1237 static struct rt6_info *rt6_make_pcpu_route(struct net *net,
1238 struct fib6_info *rt)
1240 struct rt6_info *pcpu_rt, *prev, **p;
1242 pcpu_rt = ip6_rt_pcpu_alloc(rt);
1244 dst_hold(&net->ipv6.ip6_null_entry->dst);
1245 return net->ipv6.ip6_null_entry;
1248 dst_hold(&pcpu_rt->dst);
1249 p = this_cpu_ptr(rt->rt6i_pcpu);
1250 prev = cmpxchg(p, NULL, pcpu_rt);
/* exception hash table implementation
 */
1258 static DEFINE_SPINLOCK(rt6_exception_lock);
1260 /* Remove rt6_ex from hash table and free the memory
1261 * Caller must hold rt6_exception_lock
1263 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1264 struct rt6_exception *rt6_ex)
1266 struct fib6_info *from;
1269 if (!bucket || !rt6_ex)
1272 net = dev_net(rt6_ex->rt6i->dst.dev);
1273 net->ipv6.rt6_stats->fib_rt_cache--;
1275 /* purge completely the exception to allow releasing the held resources:
1276 * some [sk] cache may keep the dst around for unlimited time
1278 from = rcu_dereference_protected(rt6_ex->rt6i->from,
1279 lockdep_is_held(&rt6_exception_lock));
1280 rcu_assign_pointer(rt6_ex->rt6i->from, NULL);
1281 fib6_info_release(from);
1282 dst_dev_put(&rt6_ex->rt6i->dst);
1284 hlist_del_rcu(&rt6_ex->hlist);
1285 dst_release(&rt6_ex->rt6i->dst);
1286 kfree_rcu(rt6_ex, rcu);
1287 WARN_ON_ONCE(!bucket->depth);
1291 /* Remove oldest rt6_ex in bucket and free the memory
1292 * Caller must hold rt6_exception_lock
1294 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1296 struct rt6_exception *rt6_ex, *oldest = NULL;
1301 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1302 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1305 rt6_remove_exception(bucket, oldest);
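/* Each fib6_info can carry a small hash table of exception routes
 * (PMTU/redirect clones): rt6_exception_hash() folds the destination
 * address - plus the source address under CONFIG_IPV6_SUBTREES - into
 * one of FIB6_EXCEPTION_BUCKET_SIZE buckets using jhash with a
 * once-initialised random seed.
 */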
1308 static u32 rt6_exception_hash(const struct in6_addr *dst,
1309 const struct in6_addr *src)
1311 static u32 seed __read_mostly;
1314 net_get_random_once(&seed, sizeof(seed));
1315 val = jhash(dst, sizeof(*dst), seed);
1317 #ifdef CONFIG_IPV6_SUBTREES
1319 val = jhash(src, sizeof(*src), val);
1321 return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1324 /* Helper function to find the cached rt in the hash table
1325 * and update bucket pointer to point to the bucket for this
1326 * (daddr, saddr) pair
1327 * Caller must hold rt6_exception_lock
1329 static struct rt6_exception *
1330 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1331 const struct in6_addr *daddr,
1332 const struct in6_addr *saddr)
1334 struct rt6_exception *rt6_ex;
1337 if (!(*bucket) || !daddr)
1340 hval = rt6_exception_hash(daddr, saddr);
1343 hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1344 struct rt6_info *rt6 = rt6_ex->rt6i;
1345 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1347 #ifdef CONFIG_IPV6_SUBTREES
1348 if (matched && saddr)
1349 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1357 /* Helper function to find the cached rt in the hash table
1358 * and update bucket pointer to point to the bucket for this
1359 * (daddr, saddr) pair
1360 * Caller must hold rcu_read_lock()
1362 static struct rt6_exception *
1363 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1364 const struct in6_addr *daddr,
1365 const struct in6_addr *saddr)
1367 struct rt6_exception *rt6_ex;
1370 WARN_ON_ONCE(!rcu_read_lock_held());
1372 if (!(*bucket) || !daddr)
1375 hval = rt6_exception_hash(daddr, saddr);
1378 hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1379 struct rt6_info *rt6 = rt6_ex->rt6i;
1380 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1382 #ifdef CONFIG_IPV6_SUBTREES
1383 if (matched && saddr)
1384 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1392 static unsigned int fib6_mtu(const struct fib6_info *rt)
1396 if (rt->fib6_pmtu) {
1397 mtu = rt->fib6_pmtu;
1399 struct net_device *dev = fib6_info_nh_dev(rt);
1400 struct inet6_dev *idev;
1403 idev = __in6_dev_get(dev);
1404 mtu = idev->cnf.mtu6;
1408 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1410 return mtu - lwtunnel_headroom(rt->fib6_nh.fib_nh_lws, mtu);
1413 static int rt6_insert_exception(struct rt6_info *nrt,
1414 struct fib6_info *ort)
1416 struct net *net = dev_net(nrt->dst.dev);
1417 struct rt6_exception_bucket *bucket;
1418 struct in6_addr *src_key = NULL;
1419 struct rt6_exception *rt6_ex;
1422 spin_lock_bh(&rt6_exception_lock);
1424 if (ort->exception_bucket_flushed) {
1429 bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1430 lockdep_is_held(&rt6_exception_lock));
1432 bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1438 rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1441 #ifdef CONFIG_IPV6_SUBTREES
1442 /* rt6i_src.plen != 0 indicates ort is in subtree
1443 * and exception table is indexed by a hash of
1444 * both rt6i_dst and rt6i_src.
1445 * Otherwise, the exception table is indexed by
1446 * a hash of only rt6i_dst.
1448 if (ort->fib6_src.plen)
1449 src_key = &nrt->rt6i_src.addr;
1451 /* rt6_mtu_change() might lower mtu on ort.
1452 * Only insert this exception route if its mtu
1453 * is less than ort's mtu value.
1455 if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
1460 rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1463 rt6_remove_exception(bucket, rt6_ex);
1465 rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1471 rt6_ex->stamp = jiffies;
1472 hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1474 net->ipv6.rt6_stats->fib_rt_cache++;
1476 if (bucket->depth > FIB6_MAX_DEPTH)
1477 rt6_exception_remove_oldest(bucket);
1480 spin_unlock_bh(&rt6_exception_lock);
1482 /* Update fn->fn_sernum to invalidate all cached dst */
1484 spin_lock_bh(&ort->fib6_table->tb6_lock);
1485 fib6_update_sernum(net, ort);
1486 spin_unlock_bh(&ort->fib6_table->tb6_lock);
1487 fib6_force_start_gc(net);
1493 void rt6_flush_exceptions(struct fib6_info *rt)
1495 struct rt6_exception_bucket *bucket;
1496 struct rt6_exception *rt6_ex;
1497 struct hlist_node *tmp;
1500 spin_lock_bh(&rt6_exception_lock);
1501 /* Prevent rt6_insert_exception() to recreate the bucket list */
1502 rt->exception_bucket_flushed = 1;
1504 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1505 lockdep_is_held(&rt6_exception_lock));
1509 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1510 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1511 rt6_remove_exception(bucket, rt6_ex);
1512 WARN_ON_ONCE(bucket->depth);
1517 spin_unlock_bh(&rt6_exception_lock);
1520 /* Find cached rt in the hash table inside passed in rt
1521 * Caller has to hold rcu_read_lock()
1523 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
1524 struct in6_addr *daddr,
1525 struct in6_addr *saddr)
1527 struct rt6_exception_bucket *bucket;
1528 struct in6_addr *src_key = NULL;
1529 struct rt6_exception *rt6_ex;
1530 struct rt6_info *res = NULL;
1532 bucket = rcu_dereference(rt->rt6i_exception_bucket);
1534 #ifdef CONFIG_IPV6_SUBTREES
1535 /* rt6i_src.plen != 0 indicates rt is in subtree
1536 * and exception table is indexed by a hash of
1537 * both rt6i_dst and rt6i_src.
1538 * Otherwise, the exception table is indexed by
1539 * a hash of only rt6i_dst.
1541 if (rt->fib6_src.plen)
1544 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1546 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1552 /* Remove the passed in cached rt from the hash table that contains it */
1553 static int rt6_remove_exception_rt(struct rt6_info *rt)
1555 struct rt6_exception_bucket *bucket;
1556 struct in6_addr *src_key = NULL;
1557 struct rt6_exception *rt6_ex;
1558 struct fib6_info *from;
1561 from = rcu_dereference(rt->from);
1563 !(rt->rt6i_flags & RTF_CACHE))
1566 if (!rcu_access_pointer(from->rt6i_exception_bucket))
1569 spin_lock_bh(&rt6_exception_lock);
1570 bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1571 lockdep_is_held(&rt6_exception_lock));
1572 #ifdef CONFIG_IPV6_SUBTREES
1573 /* rt6i_src.plen != 0 indicates 'from' is in subtree
1574 * and exception table is indexed by a hash of
1575 * both rt6i_dst and rt6i_src.
1576 * Otherwise, the exception table is indexed by
1577 * a hash of only rt6i_dst.
1579 if (from->fib6_src.plen)
1580 src_key = &rt->rt6i_src.addr;
1582 rt6_ex = __rt6_find_exception_spinlock(&bucket,
1586 rt6_remove_exception(bucket, rt6_ex);
1592 spin_unlock_bh(&rt6_exception_lock);
1596 /* Find rt6_ex which contains the passed in rt cache and
1599 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1601 struct rt6_exception_bucket *bucket;
1602 struct in6_addr *src_key = NULL;
1603 struct rt6_exception *rt6_ex;
1604 struct fib6_info *from;
1607 from = rcu_dereference(rt->from);
1608 if (!from || !(rt->rt6i_flags & RTF_CACHE))
1611 bucket = rcu_dereference(from->rt6i_exception_bucket);
1613 #ifdef CONFIG_IPV6_SUBTREES
1614 /* rt6i_src.plen != 0 indicates 'from' is in subtree
1615 * and exception table is indexed by a hash of
1616 * both rt6i_dst and rt6i_src.
1617 * Otherwise, the exception table is indexed by
1618 * a hash of only rt6i_dst.
1620 if (from->fib6_src.plen)
1621 src_key = &rt->rt6i_src.addr;
1623 rt6_ex = __rt6_find_exception_rcu(&bucket,
1627 rt6_ex->stamp = jiffies;
1633 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1634 struct rt6_info *rt, int mtu)
1636 /* If the new MTU is lower than the route PMTU, this new MTU will be the
1637 * lowest MTU in the path: always allow updating the route PMTU to
1638 * reflect PMTU decreases.
1640 * If the new MTU is higher, and the route PMTU is equal to the local
1641 * MTU, this means the old MTU is the lowest in the path, so allow
1642 * updating it: if other nodes now have lower MTUs, PMTU discovery will
1646 if (dst_mtu(&rt->dst) >= mtu)
1649 if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1655 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
1656 struct fib6_info *rt, int mtu)
1658 struct rt6_exception_bucket *bucket;
1659 struct rt6_exception *rt6_ex;
1662 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1663 lockdep_is_held(&rt6_exception_lock));
1668 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1669 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1670 struct rt6_info *entry = rt6_ex->rt6i;
1672 /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
1673 * route), the metrics of its rt->from have already
1676 if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
1677 rt6_mtu_change_route_allowed(idev, entry, mtu))
1678 dst_metric_set(&entry->dst, RTAX_MTU, mtu);
1684 #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE)
1686 static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
1687 struct in6_addr *gateway)
1689 struct rt6_exception_bucket *bucket;
1690 struct rt6_exception *rt6_ex;
1691 struct hlist_node *tmp;
1694 if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1697 spin_lock_bh(&rt6_exception_lock);
1698 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1699 lockdep_is_held(&rt6_exception_lock));
1702 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1703 hlist_for_each_entry_safe(rt6_ex, tmp,
1704 &bucket->chain, hlist) {
1705 struct rt6_info *entry = rt6_ex->rt6i;
1707 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1708 RTF_CACHE_GATEWAY &&
1709 ipv6_addr_equal(gateway,
1710 &entry->rt6i_gateway)) {
1711 rt6_remove_exception(bucket, rt6_ex);
1718 spin_unlock_bh(&rt6_exception_lock);
1721 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1722 struct rt6_exception *rt6_ex,
1723 struct fib6_gc_args *gc_args,
1726 struct rt6_info *rt = rt6_ex->rt6i;
1728 /* we are pruning and obsoleting aged-out and non gateway exceptions
1729 * even if others have still references to them, so that on next
1730 * dst_check() such references can be dropped.
1731 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
1732 * expired, independently from their aging, as per RFC 8201 section 4
1734 if (!(rt->rt6i_flags & RTF_EXPIRES)) {
1735 if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1736 RT6_TRACE("aging clone %p\n", rt);
1737 rt6_remove_exception(bucket, rt6_ex);
1740 } else if (time_after(jiffies, rt->dst.expires)) {
1741 RT6_TRACE("purging expired route %p\n", rt);
1742 rt6_remove_exception(bucket, rt6_ex);
1746 if (rt->rt6i_flags & RTF_GATEWAY) {
1747 struct neighbour *neigh;
1748 __u8 neigh_flags = 0;
1750 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
1752 neigh_flags = neigh->flags;
1754 if (!(neigh_flags & NTF_ROUTER)) {
1755 RT6_TRACE("purging route %p via non-router but gateway\n",
1757 rt6_remove_exception(bucket, rt6_ex);
1765 void rt6_age_exceptions(struct fib6_info *rt,
1766 struct fib6_gc_args *gc_args,
1769 struct rt6_exception_bucket *bucket;
1770 struct rt6_exception *rt6_ex;
1771 struct hlist_node *tmp;
1774 if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1778 spin_lock(&rt6_exception_lock);
1779 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1780 lockdep_is_held(&rt6_exception_lock));
1783 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1784 hlist_for_each_entry_safe(rt6_ex, tmp,
1785 &bucket->chain, hlist) {
1786 rt6_age_examine_exception(bucket, rt6_ex,
1792 spin_unlock(&rt6_exception_lock);
1793 rcu_read_unlock_bh();
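/* fib6_table_lookup() resolves the fib entry for a flow: it walks the
 * trie for daddr (and saddr with subtrees), runs rt6_select() on the
 * node and backtracks towards the root on a miss; if nothing reachable
 * is found it retries without RT6_LOOKUP_F_REACHABLE so an unreachable
 * router can still be used as a last resort.
 */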
1796 /* must be called with rcu lock held */
1797 struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
1798 int oif, struct flowi6 *fl6, int strict)
1800 struct fib6_node *fn, *saved_fn;
1801 struct fib6_info *f6i;
1803 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1806 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1810 f6i = rt6_select(net, fn, oif, strict);
1811 if (f6i == net->ipv6.fib6_null_entry) {
1812 fn = fib6_backtrack(fn, &fl6->saddr);
1814 goto redo_rt6_select;
1815 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1816 /* also consider unreachable route */
1817 strict &= ~RT6_LOOKUP_F_REACHABLE;
1819 goto redo_rt6_select;
1823 trace_fib6_table_lookup(net, f6i, table, fl6);
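/* ip6_pol_route() turns the fib entry into a usable dst.  Three outcomes:
 * a matching exception-table clone, an uncached RTF_CACHE clone for
 * FLOWI_FLAG_KNOWN_NH lookups on gatewayless routes (where the neighbour
 * daddr may differ from fl6->daddr), or the per-cpu copy of the entry.
 */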
1828 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1829 int oif, struct flowi6 *fl6,
1830 const struct sk_buff *skb, int flags)
1832 struct fib6_info *f6i;
1833 struct rt6_info *rt;
1836 strict |= flags & RT6_LOOKUP_F_IFACE;
1837 strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1838 if (net->ipv6.devconf_all->forwarding == 0)
1839 strict |= RT6_LOOKUP_F_REACHABLE;
1843 f6i = fib6_table_lookup(net, table, oif, fl6, strict);
1844 if (f6i->fib6_nsiblings)
1845 f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
1847 if (f6i == net->ipv6.fib6_null_entry) {
1848 rt = net->ipv6.ip6_null_entry;
	/* Search through exception table */
1855 rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1857 if (ip6_hold_safe(net, &rt))
1858 dst_use_noref(&rt->dst, jiffies);
1862 } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1863 !f6i->fib6_nh.fib_nh_has_gw)) {
1864 /* Create a RTF_CACHE clone which will not be
1865 * owned by the fib6 tree. It is for the special case where
1866 * the daddr in the skb during the neighbor look-up is different
1867 * from the fl6->daddr used to look-up route here.
1869 struct rt6_info *uncached_rt;
1871 uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);
1876 /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1877 * No need for another dst_hold()
1879 rt6_uncached_list_add(uncached_rt);
1880 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1882 uncached_rt = net->ipv6.ip6_null_entry;
1883 dst_hold(&uncached_rt->dst);
1888 /* Get a percpu copy */
1890 struct rt6_info *pcpu_rt;
1893 pcpu_rt = rt6_get_pcpu_route(f6i);
1896 pcpu_rt = rt6_make_pcpu_route(net, f6i);
1904 EXPORT_SYMBOL_GPL(ip6_pol_route);
1906 static struct rt6_info *ip6_pol_route_input(struct net *net,
1907 struct fib6_table *table,
1909 const struct sk_buff *skb,
1912 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1915 struct dst_entry *ip6_route_input_lookup(struct net *net,
1916 struct net_device *dev,
1918 const struct sk_buff *skb,
1921 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1922 flags |= RT6_LOOKUP_F_IFACE;
1924 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1926 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1928 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1929 struct flow_keys *keys,
1930 struct flow_keys *flkeys)
1932 const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1933 const struct ipv6hdr *key_iph = outer_iph;
1934 struct flow_keys *_flkeys = flkeys;
1935 const struct ipv6hdr *inner_iph;
1936 const struct icmp6hdr *icmph;
1937 struct ipv6hdr _inner_iph;
1938 struct icmp6hdr _icmph;
1940 if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1943 icmph = skb_header_pointer(skb, skb_transport_offset(skb),
1944 sizeof(_icmph), &_icmph);
1948 if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1949 icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1950 icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1951 icmph->icmp6_type != ICMPV6_PARAMPROB)
1954 inner_iph = skb_header_pointer(skb,
1955 skb_transport_offset(skb) + sizeof(*icmph),
1956 sizeof(_inner_iph), &_inner_iph);
1960 key_iph = inner_iph;
1964 keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
1965 keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
1966 keys->tags.flow_label = _flkeys->tags.flow_label;
1967 keys->basic.ip_proto = _flkeys->basic.ip_proto;
1969 keys->addrs.v6addrs.src = key_iph->saddr;
1970 keys->addrs.v6addrs.dst = key_iph->daddr;
1971 keys->tags.flow_label = ip6_flowlabel(key_iph);
1972 keys->basic.ip_proto = key_iph->nexthdr;
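/* Two multipath hash policies are supported: 0 hashes L3 only (addresses,
 * flow label, next header), 1 adds the L4 ports, preferring flow keys
 * already dissected by the fib rules or the skb's own hash when available.
 * For ICMPv6 errors ip6_multipath_l3_keys() hashes the embedded original
 * header so error replies follow the same path as the flow they refer to.
 */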
1976 /* if skb is set it will be used and fl6 can be NULL */
1977 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
1978 const struct sk_buff *skb, struct flow_keys *flkeys)
1980 struct flow_keys hash_keys;
1983 switch (ip6_multipath_hash_policy(net)) {
1985 memset(&hash_keys, 0, sizeof(hash_keys));
1986 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1988 ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
1990 hash_keys.addrs.v6addrs.src = fl6->saddr;
1991 hash_keys.addrs.v6addrs.dst = fl6->daddr;
1992 hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
1993 hash_keys.basic.ip_proto = fl6->flowi6_proto;
1998 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1999 struct flow_keys keys;
2001 /* short-circuit if we already have L4 hash present */
2003 return skb_get_hash_raw(skb) >> 1;
2005 memset(&hash_keys, 0, sizeof(hash_keys));
2008 skb_flow_dissect_flow_keys(skb, &keys, flag);
2011 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2012 hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2013 hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2014 hash_keys.ports.src = flkeys->ports.src;
2015 hash_keys.ports.dst = flkeys->ports.dst;
2016 hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2018 memset(&hash_keys, 0, sizeof(hash_keys));
2019 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2020 hash_keys.addrs.v6addrs.src = fl6->saddr;
2021 hash_keys.addrs.v6addrs.dst = fl6->daddr;
2022 hash_keys.ports.src = fl6->fl6_sport;
2023 hash_keys.ports.dst = fl6->fl6_dport;
2024 hash_keys.basic.ip_proto = fl6->flowi6_proto;
2028 mhash = flow_hash_from_keys(&hash_keys);
2033 void ip6_route_input(struct sk_buff *skb)
2035 const struct ipv6hdr *iph = ipv6_hdr(skb);
2036 struct net *net = dev_net(skb->dev);
2037 int flags = RT6_LOOKUP_F_HAS_SADDR;
2038 struct ip_tunnel_info *tun_info;
2039 struct flowi6 fl6 = {
2040 .flowi6_iif = skb->dev->ifindex,
2041 .daddr = iph->daddr,
2042 .saddr = iph->saddr,
2043 .flowlabel = ip6_flowinfo(iph),
2044 .flowi6_mark = skb->mark,
2045 .flowi6_proto = iph->nexthdr,
2047 struct flow_keys *flkeys = NULL, _flkeys;
2049 tun_info = skb_tunnel_info(skb);
2050 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2051 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
2053 if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
2056 if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
2057 fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
2060 ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
2063 static struct rt6_info *ip6_pol_route_output(struct net *net,
2064 struct fib6_table *table,
2066 const struct sk_buff *skb,
2069 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2072 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
2073 struct flowi6 *fl6, int flags)
2077 if (ipv6_addr_type(&fl6->daddr) &
2078 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
2079 struct dst_entry *dst;
2081 dst = l3mdev_link_scope_lookup(net, fl6);
2086 fl6->flowi6_iif = LOOPBACK_IFINDEX;
2088 any_src = ipv6_addr_any(&fl6->saddr);
2089 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2090 (fl6->flowi6_oif && any_src))
2091 flags |= RT6_LOOKUP_F_IFACE;
2094 flags |= RT6_LOOKUP_F_HAS_SADDR;
2096 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2098 return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2100 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
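/* ip6_blackhole_route() re-homes @dst_orig onto ip6_dst_blackhole_ops:
 * the copy keeps the addressing and metrics of the original but discards
 * every packet and ignores PMTU/redirect updates; the reference on
 * @dst_orig is dropped before returning.
 */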
2102 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2104 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
2105 struct net_device *loopback_dev = net->loopback_dev;
2106 struct dst_entry *new = NULL;
2108 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2109 DST_OBSOLETE_DEAD, 0);
2112 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2116 new->input = dst_discard;
2117 new->output = dst_discard_out;
2119 dst_copy_metrics(new, &ort->dst);
2121 rt->rt6i_idev = in6_dev_get(loopback_dev);
2122 rt->rt6i_gateway = ort->rt6i_gateway;
2123 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2125 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2126 #ifdef CONFIG_IPV6_SUBTREES
2127 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2131 dst_release(dst_orig);
2132 return new ? new : ERR_PTR(-ENOMEM);
/*
 *	Destination cache support functions
 */
2139 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2143 if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2146 if (fib6_check_expired(f6i))
2152 static struct dst_entry *rt6_check(struct rt6_info *rt,
2153 struct fib6_info *from,
2158 if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2159 rt_cookie != cookie)
2162 if (rt6_check_expired(rt))
2168 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2169 struct fib6_info *from,
2172 if (!__rt6_check_expired(rt) &&
2173 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2174 fib6_check(from, cookie))
2180 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2182 struct dst_entry *dst_ret;
2183 struct fib6_info *from;
2184 struct rt6_info *rt;
2186 rt = container_of(dst, struct rt6_info, dst);
2190 /* All IPV6 dsts are created with ->obsolete set to the value
2191 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2192 * into this function always.
2195 from = rcu_dereference(rt->from);
2197 if (from && (rt->rt6i_flags & RTF_PCPU ||
2198 unlikely(!list_empty(&rt->rt6i_uncached))))
2199 dst_ret = rt6_dst_from_check(rt, from, cookie);
2201 dst_ret = rt6_check(rt, from, cookie);
2208 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2210 struct rt6_info *rt = (struct rt6_info *) dst;
2213 if (rt->rt6i_flags & RTF_CACHE) {
2215 if (rt6_check_expired(rt)) {
2216 rt6_remove_exception_rt(rt);
2228 static void ip6_link_failure(struct sk_buff *skb)
2230 struct rt6_info *rt;
2232 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2234 rt = (struct rt6_info *) skb_dst(skb);
2237 if (rt->rt6i_flags & RTF_CACHE) {
2238 rt6_remove_exception_rt(rt);
2240 struct fib6_info *from;
2241 struct fib6_node *fn;
2243 from = rcu_dereference(rt->from);
2245 fn = rcu_dereference(from->fib6_node);
2246 if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2254 static void rt6_update_expires(struct rt6_info *rt0, int timeout)
2256 if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
2257 struct fib6_info *from;
2260 from = rcu_dereference(rt0->from);
2262 rt0->dst.expires = from->expires;
2266 dst_set_expires(&rt0->dst, timeout);
2267 rt0->rt6i_flags |= RTF_EXPIRES;
2270 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2272 struct net *net = dev_net(rt->dst.dev);
2274 dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2275 rt->rt6i_flags |= RTF_MODIFIED;
2276 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2279 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2281 return !(rt->rt6i_flags & RTF_CACHE) &&
2282 (rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from));
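/* PMTU updates never raise the path MTU and are clamped to IPV6_MIN_MTU.
 * Existing RTF_CACHE clones (and dsts that cannot take a cache entry)
 * are updated in place; otherwise a new cache clone is created and
 * inserted into the parent's exception table so the fib entry's own
 * metrics stay untouched.
 */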
2285 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2286 const struct ipv6hdr *iph, u32 mtu)
2288 const struct in6_addr *daddr, *saddr;
2289 struct rt6_info *rt6 = (struct rt6_info *)dst;
2291 if (dst_metric_locked(dst, RTAX_MTU))
2295 daddr = &iph->daddr;
2296 saddr = &iph->saddr;
2298 daddr = &sk->sk_v6_daddr;
2299 saddr = &inet6_sk(sk)->saddr;
2304 dst_confirm_neigh(dst, daddr);
2305 mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2306 if (mtu >= dst_mtu(dst))
2309 if (!rt6_cache_allowed_for_pmtu(rt6)) {
2310 rt6_do_update_pmtu(rt6, mtu);
2311 /* update rt6_ex->stamp for cache */
2312 if (rt6->rt6i_flags & RTF_CACHE)
2313 rt6_update_exception_stamp_rt(rt6);
2315 struct fib6_info *from;
2316 struct rt6_info *nrt6;
2319 from = rcu_dereference(rt6->from);
2320 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
2322 rt6_do_update_pmtu(nrt6, mtu);
2323 if (rt6_insert_exception(nrt6, from))
2324 dst_release_immediate(&nrt6->dst);
2330 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2331 struct sk_buff *skb, u32 mtu)
2333 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2336 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2337 int oif, u32 mark, kuid_t uid)
2339 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2340 struct dst_entry *dst;
2341 struct flowi6 fl6 = {
2343 .flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
2344 .daddr = iph->daddr,
2345 .saddr = iph->saddr,
2346 .flowlabel = ip6_flowinfo(iph),
2350 dst = ip6_route_output(net, NULL, &fl6);
2352 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2355 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2357 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2359 int oif = sk->sk_bound_dev_if;
2360 struct dst_entry *dst;
2362 if (!oif && skb->dev)
2363 oif = l3mdev_master_ifindex(skb->dev);
2365 ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);
2367 dst = __sk_dst_get(sk);
2368 if (!dst || !dst->obsolete ||
2369 dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2373 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2374 ip6_datagram_dst_update(sk, false);
2377 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2379 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2380 const struct flowi6 *fl6)
2382 #ifdef CONFIG_IPV6_SUBTREES
2383 struct ipv6_pinfo *np = inet6_sk(sk);
2386 ip6_dst_store(sk, dst,
2387 ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2388 &sk->sk_v6_daddr : NULL,
2389 #ifdef CONFIG_IPV6_SUBTREES
2390 ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2396 /* Handle redirects */
2397 struct ip6rd_flowi {
2399 struct in6_addr gateway;
2402 static struct rt6_info *__ip6_route_redirect(struct net *net,
2403 struct fib6_table *table,
2405 const struct sk_buff *skb,
2408 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2409 struct rt6_info *ret = NULL, *rt_cache;
2410 struct fib6_info *rt;
2411 struct fib6_node *fn;
	/* Get the "current" route for this destination and
	 * check if the redirect has come from appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */
2424 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2426 for_each_fib6_node_rt_rcu(fn) {
2427 if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
2429 if (fib6_check_expired(rt))
2431 if (rt->fib6_flags & RTF_REJECT)
2433 if (!rt->fib6_nh.fib_nh_has_gw)
2435 if (fl6->flowi6_oif != rt->fib6_nh.fib_nh_dev->ifindex)
2437 /* rt_cache's gateway might be different from its 'parent'
2438 * in the case of an ip redirect.
2439 * So we keep searching in the exception table if the gateway
2442 if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.fib_nh_gw6)) {
2443 rt_cache = rt6_find_cached_rt(rt,
2447 ipv6_addr_equal(&rdfl->gateway,
2448 &rt_cache->rt6i_gateway)) {
2458 rt = net->ipv6.fib6_null_entry;
2459 else if (rt->fib6_flags & RTF_REJECT) {
2460 ret = net->ipv6.ip6_null_entry;
2464 if (rt == net->ipv6.fib6_null_entry) {
2465 fn = fib6_backtrack(fn, &fl6->saddr);
2472 ip6_hold_safe(net, &ret);
2474 ret = ip6_create_rt_rcu(rt);
2478 trace_fib6_table_lookup(net, rt, table, fl6);
2482 static struct dst_entry *ip6_route_redirect(struct net *net,
2483 const struct flowi6 *fl6,
2484 const struct sk_buff *skb,
2485 const struct in6_addr *gateway)
2487 int flags = RT6_LOOKUP_F_HAS_SADDR;
2488 struct ip6rd_flowi rdfl;
2491 rdfl.gateway = *gateway;
2493 return fib6_rule_lookup(net, &rdfl.fl6, skb,
2494 flags, __ip6_route_redirect);
2497 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2500 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2501 struct dst_entry *dst;
2502 struct flowi6 fl6 = {
2503 .flowi6_iif = LOOPBACK_IFINDEX,
2505 .flowi6_mark = mark,
2506 .daddr = iph->daddr,
2507 .saddr = iph->saddr,
2508 .flowlabel = ip6_flowinfo(iph),
2512 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2513 rt6_do_redirect(dst, NULL, skb);
2516 EXPORT_SYMBOL_GPL(ip6_redirect);
2518 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
2520 const struct ipv6hdr *iph = ipv6_hdr(skb);
2521 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2522 struct dst_entry *dst;
2523 struct flowi6 fl6 = {
2524 .flowi6_iif = LOOPBACK_IFINDEX,
2527 .saddr = iph->daddr,
2528 .flowi6_uid = sock_net_uid(net, NULL),
2531 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2532 rt6_do_redirect(dst, NULL, skb);
2536 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2538 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2541 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
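/* Default advertised MSS for a route: the path MTU minus the IPv6 and TCP
 * headers, clamped to at least ip6_rt_min_advmss and capped by the largest
 * non-jumbo payload (see the IPV6_MAXPLEN note below).
 */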
2543 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2545 struct net_device *dev = dst->dev;
2546 unsigned int mtu = dst_mtu(dst);
2547 struct net *net = dev_net(dev);
2549 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2551 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2552 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2555 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2556 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2557 * IPV6_MAXPLEN is also valid and means: "any MSS,
2558 * rely only on pmtu discovery"
2560 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2565 static unsigned int ip6_mtu(const struct dst_entry *dst)
2567 struct inet6_dev *idev;
2570 mtu = dst_metric_raw(dst, RTAX_MTU);
2577 idev = __in6_dev_get(dst->dev);
2579 mtu = idev->cnf.mtu6;
2583 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2585 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2589 * 1. mtu on route is locked - use it
2590 * 2. mtu from nexthop exception
2591 * 3. mtu from egress device
2593 * based on ip6_dst_mtu_forward and exception logic of
2594 * rt6_find_cached_rt; called with rcu_read_lock
2596 u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
2597 struct in6_addr *saddr)
2599 struct rt6_exception_bucket *bucket;
2600 struct rt6_exception *rt6_ex;
2601 struct in6_addr *src_key;
2602 struct inet6_dev *idev;
2605 if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
2606 mtu = f6i->fib6_pmtu;
2612 #ifdef CONFIG_IPV6_SUBTREES
2613 if (f6i->fib6_src.plen)
2617 bucket = rcu_dereference(f6i->rt6i_exception_bucket);
2618 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
2619 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
2620 mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);
2623 struct net_device *dev = fib6_info_nh_dev(f6i);
2626 idev = __in6_dev_get(dev);
2627 if (idev && idev->cnf.mtu6 > mtu)
2628 mtu = idev->cnf.mtu6;
2631 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2633 return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
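/* Allocate a standalone, uncached host route for locally generated
 * ICMPv6/ND packets.  The entry is placed on uncached_list so that
 * rt6_disable_ip() can release the device reference, and the result is
 * passed through xfrm_lookup() before use.
 */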
2636 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2639 struct dst_entry *dst;
2640 struct rt6_info *rt;
2641 struct inet6_dev *idev = in6_dev_get(dev);
2642 struct net *net = dev_net(dev);
2644 if (unlikely(!idev))
2645 return ERR_PTR(-ENODEV);
2647 rt = ip6_dst_alloc(net, dev, 0);
2648 if (unlikely(!rt)) {
2650 dst = ERR_PTR(-ENOMEM);
2654 rt->dst.flags |= DST_HOST;
2655 rt->dst.input = ip6_input;
2656 rt->dst.output = ip6_output;
2657 rt->rt6i_gateway = fl6->daddr;
2658 rt->rt6i_dst.addr = fl6->daddr;
2659 rt->rt6i_dst.plen = 128;
2660 rt->rt6i_idev = idev;
2661 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2663 /* Add this dst into uncached_list so that rt6_disable_ip() can
2664 * properly release the net_device
2666 rt6_uncached_list_add(rt);
2667 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2669 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2675 static int ip6_dst_gc(struct dst_ops *ops)
2677 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2678 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2679 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2680 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2681 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2682 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2685 entries = dst_entries_get_fast(ops);
2686 if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2687 entries <= rt_max_size)
2690 net->ipv6.ip6_rt_gc_expire++;
2691 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2692 entries = dst_entries_get_slow(ops);
2693 if (entries < ops->gc_thresh)
2694 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2696 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2697 return entries > rt_max_size;
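/* Look up a candidate nexthop gateway in a specific table when validating
 * a user-supplied route; link state is ignored for the lookup
 * (RT6_LOOKUP_F_IGNORE_LINKSTATE), and the function falls back to a full
 * lookup if the table has no match.
 */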
2700 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2701 struct fib6_config *cfg,
2702 const struct in6_addr *gw_addr,
2703 u32 tbid, int flags)
2705 struct flowi6 fl6 = {
2706 .flowi6_oif = cfg->fc_ifindex,
2708 .saddr = cfg->fc_prefsrc,
2710 struct fib6_table *table;
2711 struct rt6_info *rt;
2713 table = fib6_get_table(net, tbid);
2717 if (!ipv6_addr_any(&cfg->fc_prefsrc))
2718 flags |= RT6_LOOKUP_F_HAS_SADDR;
2720 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2721 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2723 /* if table lookup failed, fall back to full lookup */
2724 if (rt == net->ipv6.ip6_null_entry) {
2732 static int ip6_route_check_nh_onlink(struct net *net,
2733 struct fib6_config *cfg,
2734 const struct net_device *dev,
2735 struct netlink_ext_ack *extack)
2737 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2738 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2739 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2740 struct fib6_info *from;
2741 struct rt6_info *grt;
2745 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2748 from = rcu_dereference(grt->from);
2749 if (!grt->dst.error &&
2750 /* ignore match if it is the default route */
2751 from && !ipv6_addr_any(&from->fib6_dst.addr) &&
2752 (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2753 NL_SET_ERR_MSG(extack,
2754 "Nexthop has invalid gateway or device mismatch");
2765 static int ip6_route_check_nh(struct net *net,
2766 struct fib6_config *cfg,
2767 struct net_device **_dev,
2768 struct inet6_dev **idev)
2770 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2771 struct net_device *dev = _dev ? *_dev : NULL;
2772 struct rt6_info *grt = NULL;
2773 int err = -EHOSTUNREACH;
2775 if (cfg->fc_table) {
2776 int flags = RT6_LOOKUP_F_IFACE;
2778 grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2779 cfg->fc_table, flags);
2781 if (grt->rt6i_flags & RTF_GATEWAY ||
2782 (dev && dev != grt->dst.dev)) {
2790 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2796 if (dev != grt->dst.dev) {
2801 *_dev = dev = grt->dst.dev;
2802 *idev = grt->rt6i_idev;
2804 in6_dev_hold(grt->rt6i_idev);
2807 if (!(grt->rt6i_flags & RTF_GATEWAY))
2816 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2817 struct net_device **_dev, struct inet6_dev **idev,
2818 struct netlink_ext_ack *extack)
2820 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2821 int gwa_type = ipv6_addr_type(gw_addr);
2822 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2823 const struct net_device *dev = *_dev;
2824 bool need_addr_check = !dev;
2827 /* if gw_addr is local we will fail to detect this in case
2828 * the address is still TENTATIVE (DAD in progress). rt6_lookup()
2829 * will return the already-added prefix route via the interface that
2830 * the prefix route was assigned to, which might be non-loopback.
2833 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2834 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2838 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2839 /* IPv6 strictly inhibits using non-link-local
2840 * addresses as the nexthop address.
2841 * Otherwise, a router will not be able to send redirects.
2842 * It is very good, but in some (rare!) circumstances
2843 * (SIT, PtP, NBMA NOARP links) it is handy to allow
2844 * some exceptions. --ANK
2845 * We allow IPv4-mapped nexthops to support RFC4798-type
2848 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2849 NL_SET_ERR_MSG(extack, "Invalid gateway address");
2853 if (cfg->fc_flags & RTNH_F_ONLINK)
2854 err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2856 err = ip6_route_check_nh(net, cfg, _dev, idev);
2862 /* reload in case device was changed */
2867 NL_SET_ERR_MSG(extack, "Egress device not specified");
2869 } else if (dev->flags & IFF_LOOPBACK) {
2870 NL_SET_ERR_MSG(extack,
2871 "Egress device can not be loopback device for this route");
2875 /* if we did not check gw_addr above, do so now that the
2876 * egress device has been resolved.
2878 if (need_addr_check &&
2879 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2880 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2889 static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type)
2891 if ((flags & RTF_REJECT) ||
2892 (dev && (dev->flags & IFF_LOOPBACK) &&
2893 !(addr_type & IPV6_ADDR_LOOPBACK) &&
2894 !(flags & RTF_LOCAL)))
2900 int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
2901 struct fib6_config *cfg, gfp_t gfp_flags,
2902 struct netlink_ext_ack *extack)
2904 struct net_device *dev = NULL;
2905 struct inet6_dev *idev = NULL;
2909 fib6_nh->fib_nh_family = AF_INET6;
2912 if (cfg->fc_ifindex) {
2913 dev = dev_get_by_index(net, cfg->fc_ifindex);
2916 idev = in6_dev_get(dev);
2921 if (cfg->fc_flags & RTNH_F_ONLINK) {
2923 NL_SET_ERR_MSG(extack,
2924 "Nexthop device required for onlink");
2928 if (!(dev->flags & IFF_UP)) {
2929 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2934 fib6_nh->fib_nh_flags |= RTNH_F_ONLINK;
2937 if (cfg->fc_encap) {
2938 struct lwtunnel_state *lwtstate;
2940 err = lwtunnel_build_state(cfg->fc_encap_type,
2941 cfg->fc_encap, AF_INET6, cfg,
2946 fib6_nh->fib_nh_lws = lwtstate_get(lwtstate);
2949 fib6_nh->fib_nh_weight = 1;
2951 /* We cannot add true routes via loopback here,
2952 * they would result in kernel looping; promote them to reject routes
2954 addr_type = ipv6_addr_type(&cfg->fc_dst);
2955 if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) {
2956 /* hold loopback dev/idev if we haven't done so. */
2957 if (dev != net->loopback_dev) {
2962 dev = net->loopback_dev;
2964 idev = in6_dev_get(dev);
2973 if (cfg->fc_flags & RTF_GATEWAY) {
2974 err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
2978 fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
2979 fib6_nh->fib_nh_has_gw = 1;
2986 if (idev->cnf.disable_ipv6) {
2987 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
2992 if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) {
2993 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2998 if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
2999 !netif_carrier_ok(dev))
3000 fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
3003 fib6_nh->fib_nh_dev = dev;
3004 fib6_nh->fib_nh_oif = dev->ifindex;
3011 lwtstate_put(fib6_nh->fib_nh_lws);
3012 fib6_nh->fib_nh_lws = NULL;
3020 void fib6_nh_release(struct fib6_nh *fib6_nh)
3022 lwtstate_put(fib6_nh->fib_nh_lws);
3024 if (fib6_nh->fib_nh_dev)
3025 dev_put(fib6_nh->fib_nh_dev);
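/* Build an unlinked fib6_info from a fib6_config (netlink or ioctl):
 * validate flags and prefix lengths, select or create the FIB table, and
 * set up metrics and the nexthop.  Insertion into the table is left to
 * the caller (__ip6_ins_rt()).
 */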
3028 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
3030 struct netlink_ext_ack *extack)
3032 struct net *net = cfg->fc_nlinfo.nl_net;
3033 struct fib6_info *rt = NULL;
3034 struct fib6_table *table;
3038 /* RTF_PCPU is an internal flag; can not be set by userspace */
3039 if (cfg->fc_flags & RTF_PCPU) {
3040 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
3044 /* RTF_CACHE is an internal flag; can not be set by userspace */
3045 if (cfg->fc_flags & RTF_CACHE) {
3046 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
3050 if (cfg->fc_type > RTN_MAX) {
3051 NL_SET_ERR_MSG(extack, "Invalid route type");
3055 if (cfg->fc_dst_len > 128) {
3056 NL_SET_ERR_MSG(extack, "Invalid prefix length");
3059 if (cfg->fc_src_len > 128) {
3060 NL_SET_ERR_MSG(extack, "Invalid source address length");
3063 #ifndef CONFIG_IPV6_SUBTREES
3064 if (cfg->fc_src_len) {
3065 NL_SET_ERR_MSG(extack,
3066 "Specifying source address requires IPV6_SUBTREES to be enabled");
3072 if (cfg->fc_nlinfo.nlh &&
3073 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
3074 table = fib6_get_table(net, cfg->fc_table);
3076 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
3077 table = fib6_new_table(net, cfg->fc_table);
3080 table = fib6_new_table(net, cfg->fc_table);
3087 rt = fib6_info_alloc(gfp_flags);
3091 rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
3093 if (IS_ERR(rt->fib6_metrics)) {
3094 err = PTR_ERR(rt->fib6_metrics);
3095 /* Do not leave garbage there. */
3096 rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
3100 if (cfg->fc_flags & RTF_ADDRCONF)
3101 rt->dst_nocount = true;
3103 if (cfg->fc_flags & RTF_EXPIRES)
3104 fib6_set_expires(rt, jiffies +
3105 clock_t_to_jiffies(cfg->fc_expires));
3107 fib6_clean_expires(rt);
3109 if (cfg->fc_protocol == RTPROT_UNSPEC)
3110 cfg->fc_protocol = RTPROT_BOOT;
3111 rt->fib6_protocol = cfg->fc_protocol;
3113 rt->fib6_table = table;
3114 rt->fib6_metric = cfg->fc_metric;
3115 rt->fib6_type = cfg->fc_type;
3116 rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY;
3118 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
3119 rt->fib6_dst.plen = cfg->fc_dst_len;
3120 if (rt->fib6_dst.plen == 128)
3121 rt->dst_host = true;
3123 #ifdef CONFIG_IPV6_SUBTREES
3124 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
3125 rt->fib6_src.plen = cfg->fc_src_len;
3127 err = fib6_nh_init(net, &rt->fib6_nh, cfg, gfp_flags, extack);
3131 /* We cannot add true routes via loopback here,
3132 * they would result in kernel looping; promote them to reject routes
3134 addr_type = ipv6_addr_type(&cfg->fc_dst);
3135 if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh.fib_nh_dev, addr_type))
3136 rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;
3138 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3139 struct net_device *dev = fib6_info_nh_dev(rt);
3141 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3142 NL_SET_ERR_MSG(extack, "Invalid source address");
3146 rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3147 rt->fib6_prefsrc.plen = 128;
3149 rt->fib6_prefsrc.plen = 0;
3153 fib6_info_release(rt);
3154 return ERR_PTR(err);
3157 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3158 struct netlink_ext_ack *extack)
3160 struct fib6_info *rt;
3163 rt = ip6_route_info_create(cfg, gfp_flags, extack);
3167 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3168 fib6_info_release(rt);
3173 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3175 struct net *net = info->nl_net;
3176 struct fib6_table *table;
3179 if (rt == net->ipv6.fib6_null_entry) {
3184 table = rt->fib6_table;
3185 spin_lock_bh(&table->tb6_lock);
3186 err = fib6_del(rt, info);
3187 spin_unlock_bh(&table->tb6_lock);
3190 fib6_info_release(rt);
3194 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3196 struct nl_info info = { .nl_net = net };
3198 return __ip6_del_rt(rt, &info);
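/* Delete a multipath route as a unit: send a single RTM_DELROUTE
 * notification covering all hops, then remove each sibling and the route
 * itself under the table lock.
 */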
3201 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3203 struct nl_info *info = &cfg->fc_nlinfo;
3204 struct net *net = info->nl_net;
3205 struct sk_buff *skb = NULL;
3206 struct fib6_table *table;
3209 if (rt == net->ipv6.fib6_null_entry)
3211 table = rt->fib6_table;
3212 spin_lock_bh(&table->tb6_lock);
3214 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3215 struct fib6_info *sibling, *next_sibling;
3217 /* prefer to send a single notification with all hops */
3218 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3220 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3222 if (rt6_fill_node(net, skb, rt, NULL,
3223 NULL, NULL, 0, RTM_DELROUTE,
3224 info->portid, seq, 0) < 0) {
3228 info->skip_notify = 1;
3231 list_for_each_entry_safe(sibling, next_sibling,
3234 err = fib6_del(sibling, info);
3240 err = fib6_del(rt, info);
3242 spin_unlock_bh(&table->tb6_lock);
3244 fib6_info_release(rt);
3247 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3248 info->nlh, gfp_any());
3253 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3257 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3260 if (cfg->fc_flags & RTF_GATEWAY &&
3261 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3264 rc = rt6_remove_exception_rt(rt);
3269 static int ip6_route_del(struct fib6_config *cfg,
3270 struct netlink_ext_ack *extack)
3272 struct rt6_info *rt_cache;
3273 struct fib6_table *table;
3274 struct fib6_info *rt;
3275 struct fib6_node *fn;
3278 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3280 NL_SET_ERR_MSG(extack, "FIB table does not exist");
3286 fn = fib6_locate(&table->tb6_root,
3287 &cfg->fc_dst, cfg->fc_dst_len,
3288 &cfg->fc_src, cfg->fc_src_len,
3289 !(cfg->fc_flags & RTF_CACHE));
3292 for_each_fib6_node_rt_rcu(fn) {
3295 if (cfg->fc_flags & RTF_CACHE) {
3298 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3301 rc = ip6_del_cached_rt(rt_cache, cfg);
3311 if (cfg->fc_ifindex &&
3313 nh->fib_nh_dev->ifindex != cfg->fc_ifindex))
3315 if (cfg->fc_flags & RTF_GATEWAY &&
3316 !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6))
3318 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3320 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3322 if (!fib6_info_hold_safe(rt))
3326 /* if a gateway was specified, only delete the one hop */
3327 if (cfg->fc_flags & RTF_GATEWAY)
3328 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3330 return __ip6_del_rt_siblings(rt, cfg);
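/* Process an ICMPv6 Redirect for an existing path: validate the message
 * and its ND options, update the neighbour entry for the new first hop,
 * and install an RTF_CACHE exception route through that neighbour.
 */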
3338 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3340 struct netevent_redirect netevent;
3341 struct rt6_info *rt, *nrt = NULL;
3342 struct ndisc_options ndopts;
3343 struct inet6_dev *in6_dev;
3344 struct neighbour *neigh;
3345 struct fib6_info *from;
3347 int optlen, on_link;
3350 optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3351 optlen -= sizeof(*msg);
3354 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3358 msg = (struct rd_msg *)icmp6_hdr(skb);
3360 if (ipv6_addr_is_multicast(&msg->dest)) {
3361 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3366 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3368 } else if (ipv6_addr_type(&msg->target) !=
3369 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3370 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3374 in6_dev = __in6_dev_get(skb->dev);
3377 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3381 * The IP source address of the Redirect MUST be the same as the current
3382 * first-hop router for the specified ICMP Destination Address.
3385 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3386 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3391 if (ndopts.nd_opts_tgt_lladdr) {
3392 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3395 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3400 rt = (struct rt6_info *) dst;
3401 if (rt->rt6i_flags & RTF_REJECT) {
3402 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3406 /* Redirect received -> path was valid.
3407 * Look, redirects are sent only in response to data packets,
3408 * so that this nexthop apparently is reachable. --ANK
3410 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3412 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3417 * We have finally decided to accept it.
3420 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3421 NEIGH_UPDATE_F_WEAK_OVERRIDE|
3422 NEIGH_UPDATE_F_OVERRIDE|
3423 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3424 NEIGH_UPDATE_F_ISROUTER)),
3425 NDISC_REDIRECT, &ndopts);
3428 from = rcu_dereference(rt->from);
3429 /* This fib6_info_hold() is safe here because we hold a reference to rt,
3430 * and rt already holds a reference to the fib6_info.
3432 fib6_info_hold(from);
3435 nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3439 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3441 nrt->rt6i_flags &= ~RTF_GATEWAY;
3443 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3445 /* No need to remove rt from the exception table if rt is
3446 * a cached route because rt6_insert_exception() will take care of duplicated exceptions.
3449 if (rt6_insert_exception(nrt, from)) {
3450 dst_release_immediate(&nrt->dst);
3454 netevent.old = &rt->dst;
3455 netevent.new = &nrt->dst;
3456 netevent.daddr = &msg->dest;
3457 netevent.neigh = neigh;
3458 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3461 fib6_info_release(from);
3462 neigh_release(neigh);
3465 #ifdef CONFIG_IPV6_ROUTE_INFO
3466 static struct fib6_info *rt6_get_route_info(struct net *net,
3467 const struct in6_addr *prefix, int prefixlen,
3468 const struct in6_addr *gwaddr,
3469 struct net_device *dev)
3471 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3472 int ifindex = dev->ifindex;
3473 struct fib6_node *fn;
3474 struct fib6_info *rt = NULL;
3475 struct fib6_table *table;
3477 table = fib6_get_table(net, tb_id);
3482 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3486 for_each_fib6_node_rt_rcu(fn) {
3487 if (rt->fib6_nh.fib_nh_dev->ifindex != ifindex)
3489 if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
3490 !rt->fib6_nh.fib_nh_has_gw)
3492 if (!ipv6_addr_equal(&rt->fib6_nh.fib_nh_gw6, gwaddr))
3494 if (!fib6_info_hold_safe(rt))
3503 static struct fib6_info *rt6_add_route_info(struct net *net,
3504 const struct in6_addr *prefix, int prefixlen,
3505 const struct in6_addr *gwaddr,
3506 struct net_device *dev,
3509 struct fib6_config cfg = {
3510 .fc_metric = IP6_RT_PRIO_USER,
3511 .fc_ifindex = dev->ifindex,
3512 .fc_dst_len = prefixlen,
3513 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3514 RTF_UP | RTF_PREF(pref),
3515 .fc_protocol = RTPROT_RA,
3516 .fc_type = RTN_UNICAST,
3517 .fc_nlinfo.portid = 0,
3518 .fc_nlinfo.nlh = NULL,
3519 .fc_nlinfo.nl_net = net,
3522 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3523 cfg.fc_dst = *prefix;
3524 cfg.fc_gateway = *gwaddr;
3526 /* We should treat it as a default route if prefix length is 0. */
3528 cfg.fc_flags |= RTF_DEFAULT;
3530 ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3532 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
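/* Default routers learnt from Router Advertisements are kept as
 * RTF_ADDRCONF|RTF_DEFAULT entries in the per-device default table; the
 * helpers below look them up, add them, and purge them (e.g. when RAs
 * are no longer accepted on the interface).
 */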
3536 struct fib6_info *rt6_get_dflt_router(struct net *net,
3537 const struct in6_addr *addr,
3538 struct net_device *dev)
3540 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3541 struct fib6_info *rt;
3542 struct fib6_table *table;
3544 table = fib6_get_table(net, tb_id);
3549 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3550 struct fib6_nh *nh = &rt->fib6_nh;
3552 if (dev == nh->fib_nh_dev &&
3553 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3554 ipv6_addr_equal(&nh->fib_nh_gw6, addr))
3557 if (rt && !fib6_info_hold_safe(rt))
3563 struct fib6_info *rt6_add_dflt_router(struct net *net,
3564 const struct in6_addr *gwaddr,
3565 struct net_device *dev,
3568 struct fib6_config cfg = {
3569 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3570 .fc_metric = IP6_RT_PRIO_USER,
3571 .fc_ifindex = dev->ifindex,
3572 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3573 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3574 .fc_protocol = RTPROT_RA,
3575 .fc_type = RTN_UNICAST,
3576 .fc_nlinfo.portid = 0,
3577 .fc_nlinfo.nlh = NULL,
3578 .fc_nlinfo.nl_net = net,
3581 cfg.fc_gateway = *gwaddr;
3583 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3584 struct fib6_table *table;
3586 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3588 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3591 return rt6_get_dflt_router(net, gwaddr, dev);
3594 static void __rt6_purge_dflt_routers(struct net *net,
3595 struct fib6_table *table)
3597 struct fib6_info *rt;
3601 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3602 struct net_device *dev = fib6_info_nh_dev(rt);
3603 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3605 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3606 (!idev || idev->cnf.accept_ra != 2) &&
3607 fib6_info_hold_safe(rt)) {
3609 ip6_del_rt(net, rt);
3615 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3618 void rt6_purge_dflt_routers(struct net *net)
3620 struct fib6_table *table;
3621 struct hlist_head *head;
3626 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3627 head = &net->ipv6.fib_table_hash[h];
3628 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3629 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3630 __rt6_purge_dflt_routers(net, table);
3637 static void rtmsg_to_fib6_config(struct net *net,
3638 struct in6_rtmsg *rtmsg,
3639 struct fib6_config *cfg)
3641 *cfg = (struct fib6_config){
3642 .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3644 .fc_ifindex = rtmsg->rtmsg_ifindex,
3645 .fc_metric = rtmsg->rtmsg_metric ? : IP6_RT_PRIO_USER,
3646 .fc_expires = rtmsg->rtmsg_info,
3647 .fc_dst_len = rtmsg->rtmsg_dst_len,
3648 .fc_src_len = rtmsg->rtmsg_src_len,
3649 .fc_flags = rtmsg->rtmsg_flags,
3650 .fc_type = rtmsg->rtmsg_type,
3652 .fc_nlinfo.nl_net = net,
3654 .fc_dst = rtmsg->rtmsg_dst,
3655 .fc_src = rtmsg->rtmsg_src,
3656 .fc_gateway = rtmsg->rtmsg_gateway,
3660 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3662 struct fib6_config cfg;
3663 struct in6_rtmsg rtmsg;
3667 case SIOCADDRT: /* Add a route */
3668 case SIOCDELRT: /* Delete a route */
3669 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3671 err = copy_from_user(&rtmsg, arg,
3672 sizeof(struct in6_rtmsg));
3676 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3681 err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3684 err = ip6_route_del(&cfg, NULL);
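/* For reference, a minimal userspace sketch (not part of this file) of
 * how the SIOCADDRT path above is typically exercised; the prefix and
 * device name are illustrative assumptions, and the usual headers
 * (<net/route.h>, <netinet/in.h>, <arpa/inet.h>, <net/if.h>,
 * <sys/ioctl.h>) are required:
 *
 *	struct in6_rtmsg rtmsg = { 0 };
 *	int fd = socket(AF_INET6, SOCK_DGRAM, 0);
 *
 *	inet_pton(AF_INET6, "2001:db8::", &rtmsg.rtmsg_dst);
 *	rtmsg.rtmsg_dst_len = 32;
 *	rtmsg.rtmsg_flags   = RTF_UP;
 *	rtmsg.rtmsg_ifindex = if_nametoindex("eth0");
 *	ioctl(fd, SIOCADDRT, &rtmsg);
 */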
3698 * Drop the packet on the floor
3701 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3704 struct dst_entry *dst = skb_dst(skb);
3705 switch (ipstats_mib_noroutes) {
3706 case IPSTATS_MIB_INNOROUTES:
3707 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3708 if (type == IPV6_ADDR_ANY) {
3709 IP6_INC_STATS(dev_net(dst->dev),
3710 __in6_dev_get_safely(skb->dev),
3711 IPSTATS_MIB_INADDRERRORS);
3715 case IPSTATS_MIB_OUTNOROUTES:
3716 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3717 ipstats_mib_noroutes);
3720 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3725 static int ip6_pkt_discard(struct sk_buff *skb)
3727 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3730 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3732 skb->dev = skb_dst(skb)->dev;
3733 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3736 static int ip6_pkt_prohibit(struct sk_buff *skb)
3738 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3741 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3743 skb->dev = skb_dst(skb)->dev;
3744 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3748 * Allocate a dst for local (unicast / anycast) address.
3751 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3752 struct inet6_dev *idev,
3753 const struct in6_addr *addr,
3754 bool anycast, gfp_t gfp_flags)
3756 struct fib6_config cfg = {
3757 .fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL,
3758 .fc_ifindex = idev->dev->ifindex,
3759 .fc_flags = RTF_UP | RTF_ADDRCONF | RTF_NONEXTHOP,
3762 .fc_protocol = RTPROT_KERNEL,
3763 .fc_nlinfo.nl_net = net,
3764 .fc_ignore_dev_down = true,
3768 cfg.fc_type = RTN_ANYCAST;
3769 cfg.fc_flags |= RTF_ANYCAST;
3771 cfg.fc_type = RTN_LOCAL;
3772 cfg.fc_flags |= RTF_LOCAL;
3775 return ip6_route_info_create(&cfg, gfp_flags, NULL);
3778 /* remove deleted ip from prefsrc entries */
3779 struct arg_dev_net_ip {
3780 struct net_device *dev;
3782 struct in6_addr *addr;
3785 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3787 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3788 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3789 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3791 if (((void *)rt->fib6_nh.fib_nh_dev == dev || !dev) &&
3792 rt != net->ipv6.fib6_null_entry &&
3793 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3794 spin_lock_bh(&rt6_exception_lock);
3795 /* remove prefsrc entry */
3796 rt->fib6_prefsrc.plen = 0;
3797 spin_unlock_bh(&rt6_exception_lock);
3802 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3804 struct net *net = dev_net(ifp->idev->dev);
3805 struct arg_dev_net_ip adni = {
3806 .dev = ifp->idev->dev,
3810 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3813 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT)
3815 /* Remove routers and update dst entries when a gateway turns into a host. */
3816 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3818 struct in6_addr *gateway = (struct in6_addr *)arg;
3820 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3821 rt->fib6_nh.fib_nh_has_gw &&
3822 ipv6_addr_equal(gateway, &rt->fib6_nh.fib_nh_gw6)) {
3826 /* Further clean up cached routes in exception table.
3827 * This is needed because cached route may have a different
3828 * gateway than its 'parent' in the case of an ip redirect.
3830 rt6_exceptions_clean_tohost(rt, gateway);
3835 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3837 fib6_clean_all(net, fib6_clean_tohost, gateway);
3840 struct arg_netdev_event {
3841 const struct net_device *dev;
3843 unsigned int nh_flags;
3844 unsigned long event;
3848 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3850 struct fib6_info *iter;
3851 struct fib6_node *fn;
3853 fn = rcu_dereference_protected(rt->fib6_node,
3854 lockdep_is_held(&rt->fib6_table->tb6_lock));
3855 iter = rcu_dereference_protected(fn->leaf,
3856 lockdep_is_held(&rt->fib6_table->tb6_lock));
3858 if (iter->fib6_metric == rt->fib6_metric &&
3859 rt6_qualify_for_ecmp(iter))
3861 iter = rcu_dereference_protected(iter->fib6_next,
3862 lockdep_is_held(&rt->fib6_table->tb6_lock));
3868 static bool rt6_is_dead(const struct fib6_info *rt)
3870 if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD ||
3871 (rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN &&
3872 ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev)))
3878 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3880 struct fib6_info *iter;
3883 if (!rt6_is_dead(rt))
3884 total += rt->fib6_nh.fib_nh_weight;
3886 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3887 if (!rt6_is_dead(iter))
3888 total += iter->fib6_nh.fib_nh_weight;
3894 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3896 int upper_bound = -1;
3898 if (!rt6_is_dead(rt)) {
3899 *weight += rt->fib6_nh.fib_nh_weight;
3900 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3903 atomic_set(&rt->fib6_nh.fib_nh_upper_bound, upper_bound);
3906 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3908 struct fib6_info *iter;
3911 rt6_upper_bound_set(rt, &weight, total);
3913 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3914 rt6_upper_bound_set(iter, &weight, total);
3917 void rt6_multipath_rebalance(struct fib6_info *rt)
3919 struct fib6_info *first;
3922 /* If the entire multipath route was marked for flushing,
3923 * there is no need to rebalance upon the removal of every sibling route.
3926 if (!rt->fib6_nsiblings || rt->should_flush)
3929 /* During lookup routes are evaluated in order, so we need to
3930 * make sure upper bounds are assigned from the first sibling onwards.
3933 first = rt6_multipath_first_sibling(rt);
3934 if (WARN_ON_ONCE(!first))
3937 total = rt6_multipath_total_weight(first);
3938 rt6_multipath_upper_bound_set(first, total);
3941 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3943 const struct arg_netdev_event *arg = p_arg;
3944 struct net *net = dev_net(arg->dev);
3946 if (rt != net->ipv6.fib6_null_entry &&
3947 rt->fib6_nh.fib_nh_dev == arg->dev) {
3948 rt->fib6_nh.fib_nh_flags &= ~arg->nh_flags;
3949 fib6_update_sernum_upto_root(net, rt);
3950 rt6_multipath_rebalance(rt);
3956 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3958 struct arg_netdev_event arg = {
3961 .nh_flags = nh_flags,
3965 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3966 arg.nh_flags |= RTNH_F_LINKDOWN;
3968 fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3971 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3972 const struct net_device *dev)
3974 struct fib6_info *iter;
3976 if (rt->fib6_nh.fib_nh_dev == dev)
3978 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3979 if (iter->fib6_nh.fib_nh_dev == dev)
3985 static void rt6_multipath_flush(struct fib6_info *rt)
3987 struct fib6_info *iter;
3989 rt->should_flush = 1;
3990 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3991 iter->should_flush = 1;
3994 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
3995 const struct net_device *down_dev)
3997 struct fib6_info *iter;
3998 unsigned int dead = 0;
4000 if (rt->fib6_nh.fib_nh_dev == down_dev ||
4001 rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
4003 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4004 if (iter->fib6_nh.fib_nh_dev == down_dev ||
4005 iter->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
4011 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
4012 const struct net_device *dev,
4013 unsigned int nh_flags)
4015 struct fib6_info *iter;
4017 if (rt->fib6_nh.fib_nh_dev == dev)
4018 rt->fib6_nh.fib_nh_flags |= nh_flags;
4019 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4020 if (iter->fib6_nh.fib_nh_dev == dev)
4021 iter->fib6_nh.fib_nh_flags |= nh_flags;
4024 /* called with write lock held for table with rt */
4025 static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
4027 const struct arg_netdev_event *arg = p_arg;
4028 const struct net_device *dev = arg->dev;
4029 struct net *net = dev_net(dev);
4031 if (rt == net->ipv6.fib6_null_entry)
4034 switch (arg->event) {
4035 case NETDEV_UNREGISTER:
4036 return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
4038 if (rt->should_flush)
4040 if (!rt->fib6_nsiblings)
4041 return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
4042 if (rt6_multipath_uses_dev(rt, dev)) {
4045 count = rt6_multipath_dead_count(rt, dev);
4046 if (rt->fib6_nsiblings + 1 == count) {
4047 rt6_multipath_flush(rt);
4050 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4052 fib6_update_sernum(net, rt);
4053 rt6_multipath_rebalance(rt);
4057 if (rt->fib6_nh.fib_nh_dev != dev ||
4058 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4060 rt->fib6_nh.fib_nh_flags |= RTNH_F_LINKDOWN;
4061 rt6_multipath_rebalance(rt);
4068 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4070 struct arg_netdev_event arg = {
4076 struct net *net = dev_net(dev);
4078 if (net->ipv6.sysctl.skip_notify_on_dev_down)
4079 fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
4081 fib6_clean_all(net, fib6_ifdown, &arg);
4084 void rt6_disable_ip(struct net_device *dev, unsigned long event)
4086 rt6_sync_down_dev(dev, event);
4087 rt6_uncached_list_flush_dev(dev_net(dev), dev);
4088 neigh_ifdown(&nd_tbl, dev);
4091 struct rt6_mtu_change_arg {
4092 struct net_device *dev;
4096 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4098 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4099 struct inet6_dev *idev;
4101 /* In IPv6, PMTU discovery is not optional,
4102 so the RTAX_MTU lock cannot disable it.
4103 We still use this lock to block changes
4104 caused by addrconf/ndisc.
4107 idev = __in6_dev_get(arg->dev);
4111 /* For an administrative MTU increase, there is no way to discover
4112 the IPv6 PMTU increase, so the PMTU should be updated here.
4113 Since RFC 1981 doesn't cover administrative MTU increases,
4114 updating the PMTU on such an increase is a MUST (e.g. for jumbo frames).
4116 if (rt->fib6_nh.fib_nh_dev == arg->dev &&
4117 !fib6_metric_locked(rt, RTAX_MTU)) {
4118 u32 mtu = rt->fib6_pmtu;
4120 if (mtu >= arg->mtu ||
4121 (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4122 fib6_metric_set(rt, RTAX_MTU, arg->mtu);
4124 spin_lock_bh(&rt6_exception_lock);
4125 rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4126 spin_unlock_bh(&rt6_exception_lock);
4131 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4133 struct rt6_mtu_change_arg arg = {
4138 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
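/* Netlink attribute policy used when parsing RTM_NEWROUTE/RTM_DELROUTE
 * and RTM_GETROUTE requests in rtm_to_fib6_config() and
 * inet6_rtm_getroute().
 */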
4141 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4142 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
4143 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) },
4144 [RTA_OIF] = { .type = NLA_U32 },
4145 [RTA_IIF] = { .type = NLA_U32 },
4146 [RTA_PRIORITY] = { .type = NLA_U32 },
4147 [RTA_METRICS] = { .type = NLA_NESTED },
4148 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
4149 [RTA_PREF] = { .type = NLA_U8 },
4150 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
4151 [RTA_ENCAP] = { .type = NLA_NESTED },
4152 [RTA_EXPIRES] = { .type = NLA_U32 },
4153 [RTA_UID] = { .type = NLA_U32 },
4154 [RTA_MARK] = { .type = NLA_U32 },
4155 [RTA_TABLE] = { .type = NLA_U32 },
4156 [RTA_IP_PROTO] = { .type = NLA_U8 },
4157 [RTA_SPORT] = { .type = NLA_U16 },
4158 [RTA_DPORT] = { .type = NLA_U16 },
4161 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4162 struct fib6_config *cfg,
4163 struct netlink_ext_ack *extack)
4166 struct nlattr *tb[RTA_MAX+1];
4170 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4176 rtm = nlmsg_data(nlh);
4178 *cfg = (struct fib6_config){
4179 .fc_table = rtm->rtm_table,
4180 .fc_dst_len = rtm->rtm_dst_len,
4181 .fc_src_len = rtm->rtm_src_len,
4183 .fc_protocol = rtm->rtm_protocol,
4184 .fc_type = rtm->rtm_type,
4186 .fc_nlinfo.portid = NETLINK_CB(skb).portid,
4187 .fc_nlinfo.nlh = nlh,
4188 .fc_nlinfo.nl_net = sock_net(skb->sk),
4191 if (rtm->rtm_type == RTN_UNREACHABLE ||
4192 rtm->rtm_type == RTN_BLACKHOLE ||
4193 rtm->rtm_type == RTN_PROHIBIT ||
4194 rtm->rtm_type == RTN_THROW)
4195 cfg->fc_flags |= RTF_REJECT;
4197 if (rtm->rtm_type == RTN_LOCAL)
4198 cfg->fc_flags |= RTF_LOCAL;
4200 if (rtm->rtm_flags & RTM_F_CLONED)
4201 cfg->fc_flags |= RTF_CACHE;
4203 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4205 if (tb[RTA_GATEWAY]) {
4206 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4207 cfg->fc_flags |= RTF_GATEWAY;
4210 NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
4215 int plen = (rtm->rtm_dst_len + 7) >> 3;
4217 if (nla_len(tb[RTA_DST]) < plen)
4220 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4224 int plen = (rtm->rtm_src_len + 7) >> 3;
4226 if (nla_len(tb[RTA_SRC]) < plen)
4229 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4232 if (tb[RTA_PREFSRC])
4233 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4236 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4238 if (tb[RTA_PRIORITY])
4239 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4241 if (tb[RTA_METRICS]) {
4242 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4243 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4247 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4249 if (tb[RTA_MULTIPATH]) {
4250 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4251 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4253 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4254 cfg->fc_mp_len, extack);
4260 pref = nla_get_u8(tb[RTA_PREF]);
4261 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4262 pref != ICMPV6_ROUTER_PREF_HIGH)
4263 pref = ICMPV6_ROUTER_PREF_MEDIUM;
4264 cfg->fc_flags |= RTF_PREF(pref);
4268 cfg->fc_encap = tb[RTA_ENCAP];
4270 if (tb[RTA_ENCAP_TYPE]) {
4271 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4273 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4278 if (tb[RTA_EXPIRES]) {
4279 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4281 if (addrconf_finite_timeout(timeout)) {
4282 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4283 cfg->fc_flags |= RTF_EXPIRES;
4293 struct fib6_info *fib6_info;
4294 struct fib6_config r_cfg;
4295 struct list_head next;
4298 static int ip6_route_info_append(struct net *net,
4299 struct list_head *rt6_nh_list,
4300 struct fib6_info *rt,
4301 struct fib6_config *r_cfg)
4306 list_for_each_entry(nh, rt6_nh_list, next) {
4307 /* check if fib6_info already exists */
4308 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4312 nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4316 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4317 list_add_tail(&nh->next, rt6_nh_list);
4322 static void ip6_route_mpath_notify(struct fib6_info *rt,
4323 struct fib6_info *rt_last,
4324 struct nl_info *info,
4327 /* if this is an APPEND route, then rt points to the first route
4328 * inserted and rt_last points to last route inserted. Userspace
4329 * wants a consistent dump of the route which starts at the first
4330 * nexthop. Since sibling routes are always added at the end of
4331 * the list, find the first sibling of the last route appended
4333 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4334 rt = list_first_entry(&rt_last->fib6_siblings,
4340 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4343 static int ip6_route_multipath_add(struct fib6_config *cfg,
4344 struct netlink_ext_ack *extack)
4346 struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4347 struct nl_info *info = &cfg->fc_nlinfo;
4348 struct fib6_config r_cfg;
4349 struct rtnexthop *rtnh;
4350 struct fib6_info *rt;
4351 struct rt6_nh *err_nh;
4352 struct rt6_nh *nh, *nh_safe;
4358 int replace = (cfg->fc_nlinfo.nlh &&
4359 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4360 LIST_HEAD(rt6_nh_list);
4362 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4363 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4364 nlflags |= NLM_F_APPEND;
4366 remaining = cfg->fc_mp_len;
4367 rtnh = (struct rtnexthop *)cfg->fc_mp;
4369 /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4370 * fib6_info structs per nexthop
4372 while (rtnh_ok(rtnh, remaining)) {
4373 memcpy(&r_cfg, cfg, sizeof(*cfg));
4374 if (rtnh->rtnh_ifindex)
4375 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4377 attrlen = rtnh_attrlen(rtnh);
4379 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4381 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4383 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4384 r_cfg.fc_flags |= RTF_GATEWAY;
4386 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4387 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4389 r_cfg.fc_encap_type = nla_get_u16(nla);
4392 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4393 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4399 if (!rt6_qualify_for_ecmp(rt)) {
4401 NL_SET_ERR_MSG(extack,
4402 "Device only routes can not be added for IPv6 using the multipath API.");
4403 fib6_info_release(rt);
4407 rt->fib6_nh.fib_nh_weight = rtnh->rtnh_hops + 1;
4409 err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4412 fib6_info_release(rt);
4416 rtnh = rtnh_next(rtnh, &remaining);
4419 /* for add and replace send one notification with all nexthops.
4420 * Skip the notification in fib6_add_rt2node and send one with
4421 * the full route when done
4423 info->skip_notify = 1;
4426 list_for_each_entry(nh, &rt6_nh_list, next) {
4427 err = __ip6_ins_rt(nh->fib6_info, info, extack);
4428 fib6_info_release(nh->fib6_info);
4431 /* save reference to last route successfully inserted */
4432 rt_last = nh->fib6_info;
4434 /* save reference to first route for notification */
4436 rt_notif = nh->fib6_info;
4439 /* nh->fib6_info is used or freed at this point, reset to NULL */
4440 nh->fib6_info = NULL;
4443 NL_SET_ERR_MSG_MOD(extack,
4444 "multipath route replace failed (check consistency of installed routes)");
4449 /* Because each route is added like a single route we remove
4450 * these flags after the first nexthop: if there is a collision,
4451 * we have already failed to add the first nexthop:
4452 * fib6_add_rt2node() has rejected it; when replacing, old
4453 * nexthops have been replaced by the first new one, the rest should be added to it.
4456 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4461 /* success ... tell user about new route */
4462 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4466 /* send notification for routes that were added so that
4467 * the delete notifications sent by ip6_route_del are
4471 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4473 /* Delete routes that were already added */
4474 list_for_each_entry(nh, &rt6_nh_list, next) {
4477 ip6_route_del(&nh->r_cfg, extack);
4481 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4483 fib6_info_release(nh->fib6_info);
4484 list_del(&nh->next);
4491 static int ip6_route_multipath_del(struct fib6_config *cfg,
4492 struct netlink_ext_ack *extack)
4494 struct fib6_config r_cfg;
4495 struct rtnexthop *rtnh;
4498 int err = 1, last_err = 0;
4500 remaining = cfg->fc_mp_len;
4501 rtnh = (struct rtnexthop *)cfg->fc_mp;
4503 /* Parse a Multipath Entry */
4504 while (rtnh_ok(rtnh, remaining)) {
4505 memcpy(&r_cfg, cfg, sizeof(*cfg));
4506 if (rtnh->rtnh_ifindex)
4507 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4509 attrlen = rtnh_attrlen(rtnh);
4511 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4513 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4515 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4516 r_cfg.fc_flags |= RTF_GATEWAY;
4519 err = ip6_route_del(&r_cfg, extack);
4523 rtnh = rtnh_next(rtnh, &remaining);
4529 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4530 struct netlink_ext_ack *extack)
4532 struct fib6_config cfg;
4535 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4540 return ip6_route_multipath_del(&cfg, extack);
4542 cfg.fc_delete_all_nh = 1;
4543 return ip6_route_del(&cfg, extack);
4547 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4548 struct netlink_ext_ack *extack)
4550 struct fib6_config cfg;
4553 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4557 if (cfg.fc_metric == 0)
4558 cfg.fc_metric = IP6_RT_PRIO_USER;
4561 return ip6_route_multipath_add(&cfg, extack);
4563 return ip6_route_add(&cfg, GFP_KERNEL, extack);
4566 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4568 int nexthop_len = 0;
4570 if (rt->fib6_nsiblings) {
4571 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
4572 + NLA_ALIGN(sizeof(struct rtnexthop))
4573 + nla_total_size(16) /* RTA_GATEWAY */
4574 + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws);
4576 nexthop_len *= rt->fib6_nsiblings;
4579 return NLMSG_ALIGN(sizeof(struct rtmsg))
4580 + nla_total_size(16) /* RTA_SRC */
4581 + nla_total_size(16) /* RTA_DST */
4582 + nla_total_size(16) /* RTA_GATEWAY */
4583 + nla_total_size(16) /* RTA_PREFSRC */
4584 + nla_total_size(4) /* RTA_TABLE */
4585 + nla_total_size(4) /* RTA_IIF */
4586 + nla_total_size(4) /* RTA_OIF */
4587 + nla_total_size(4) /* RTA_PRIORITY */
4588 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4589 + nla_total_size(sizeof(struct rta_cacheinfo))
4590 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4591 + nla_total_size(1) /* RTA_PREF */
4592 + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws)
4596 static int rt6_nexthop_info(struct sk_buff *skb, const struct fib6_nh *fib6_nh,
4597 unsigned int *flags, bool skip_oif)
4599 if (fib6_nh->fib_nh_flags & RTNH_F_DEAD)
4600 *flags |= RTNH_F_DEAD;
4602 if (fib6_nh->fib_nh_flags & RTNH_F_LINKDOWN) {
4603 *flags |= RTNH_F_LINKDOWN;
4606 if (ip6_ignore_linkdown(fib6_nh->fib_nh_dev))
4607 *flags |= RTNH_F_DEAD;
4611 if (fib6_nh->fib_nh_has_gw) {
4612 if (nla_put_in6_addr(skb, RTA_GATEWAY, &fib6_nh->fib_nh_gw6) < 0)
4613 goto nla_put_failure;
4616 *flags |= (fib6_nh->fib_nh_flags & RTNH_F_ONLINK);
4617 if (fib6_nh->fib_nh_flags & RTNH_F_OFFLOAD)
4618 *flags |= RTNH_F_OFFLOAD;
4620 /* not needed for multipath encoding b/c it has a rtnexthop struct */
4621 if (!skip_oif && fib6_nh->fib_nh_dev &&
4622 nla_put_u32(skb, RTA_OIF, fib6_nh->fib_nh_dev->ifindex))
4623 goto nla_put_failure;
4625 if (fib6_nh->fib_nh_lws &&
4626 lwtunnel_fill_encap(skb, fib6_nh->fib_nh_lws) < 0)
4627 goto nla_put_failure;
4635 /* add multipath next hop */
4636 static int rt6_add_nexthop(struct sk_buff *skb, const struct fib6_nh *fib6_nh)
4638 const struct net_device *dev = fib6_nh->fib_nh_dev;
4639 struct rtnexthop *rtnh;
4640 unsigned int flags = 0;
4642 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4644 goto nla_put_failure;
4646 rtnh->rtnh_hops = fib6_nh->fib_nh_weight - 1;
4647 rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
4649 if (rt6_nexthop_info(skb, fib6_nh, &flags, true) < 0)
4650 goto nla_put_failure;
4652 rtnh->rtnh_flags = flags;
4654 /* length of rtnetlink header + attributes */
4655 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
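/* Fill one route message for a fib6_info (or, for cached clones, the
 * dst-specific values).  Multipath routes are encoded as an RTA_MULTIPATH
 * nest with one rtnexthop per sibling.
 */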
4663 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4664 struct fib6_info *rt, struct dst_entry *dst,
4665 struct in6_addr *dest, struct in6_addr *src,
4666 int iif, int type, u32 portid, u32 seq,
4669 struct rt6_info *rt6 = (struct rt6_info *)dst;
4670 struct rt6key *rt6_dst, *rt6_src;
4671 u32 *pmetrics, table, rt6_flags;
4672 struct nlmsghdr *nlh;
4676 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4681 rt6_dst = &rt6->rt6i_dst;
4682 rt6_src = &rt6->rt6i_src;
4683 rt6_flags = rt6->rt6i_flags;
4685 rt6_dst = &rt->fib6_dst;
4686 rt6_src = &rt->fib6_src;
4687 rt6_flags = rt->fib6_flags;
4690 rtm = nlmsg_data(nlh);
4691 rtm->rtm_family = AF_INET6;
4692 rtm->rtm_dst_len = rt6_dst->plen;
4693 rtm->rtm_src_len = rt6_src->plen;
4696 table = rt->fib6_table->tb6_id;
4698 table = RT6_TABLE_UNSPEC;
4699 rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
4700 if (nla_put_u32(skb, RTA_TABLE, table))
4701 goto nla_put_failure;
4703 rtm->rtm_type = rt->fib6_type;
4705 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4706 rtm->rtm_protocol = rt->fib6_protocol;
4708 if (rt6_flags & RTF_CACHE)
4709 rtm->rtm_flags |= RTM_F_CLONED;
4712 if (nla_put_in6_addr(skb, RTA_DST, dest))
4713 goto nla_put_failure;
4714 rtm->rtm_dst_len = 128;
4715 } else if (rtm->rtm_dst_len)
4716 if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
4717 goto nla_put_failure;
4718 #ifdef CONFIG_IPV6_SUBTREES
4720 if (nla_put_in6_addr(skb, RTA_SRC, src))
4721 goto nla_put_failure;
4722 rtm->rtm_src_len = 128;
4723 } else if (rtm->rtm_src_len &&
4724 nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
4725 goto nla_put_failure;
4728 #ifdef CONFIG_IPV6_MROUTE
4729 if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
4730 int err = ip6mr_get_route(net, skb, rtm, portid);
4735 goto nla_put_failure;
4738 if (nla_put_u32(skb, RTA_IIF, iif))
4739 goto nla_put_failure;
4741 struct in6_addr saddr_buf;
4742 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4743 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4744 goto nla_put_failure;
4747 if (rt->fib6_prefsrc.plen) {
4748 struct in6_addr saddr_buf;
4749 saddr_buf = rt->fib6_prefsrc.addr;
4750 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4751 goto nla_put_failure;
4754 pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
4755 if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4756 goto nla_put_failure;
4758 if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
4759 goto nla_put_failure;
4761 /* For multipath routes, walk the siblings list and add
4762 * each as a nexthop within RTA_MULTIPATH.
4765 if (rt6_flags & RTF_GATEWAY &&
4766 nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
4767 goto nla_put_failure;
4769 if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
4770 goto nla_put_failure;
4771 } else if (rt->fib6_nsiblings) {
4772 struct fib6_info *sibling, *next_sibling;
4775 mp = nla_nest_start(skb, RTA_MULTIPATH);
4777 goto nla_put_failure;
4779 if (rt6_add_nexthop(skb, &rt->fib6_nh) < 0)
4780 goto nla_put_failure;
4782 list_for_each_entry_safe(sibling, next_sibling,
4783 &rt->fib6_siblings, fib6_siblings) {
4784 if (rt6_add_nexthop(skb, &sibling->fib6_nh) < 0)
4785 goto nla_put_failure;
4788 nla_nest_end(skb, mp);
4790 if (rt6_nexthop_info(skb, &rt->fib6_nh, &rtm->rtm_flags,
4792 goto nla_put_failure;
4795 if (rt6_flags & RTF_EXPIRES) {
4796 expires = dst ? dst->expires : rt->expires;
4800 if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4801 goto nla_put_failure;
4803 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
4804 goto nla_put_failure;
4807 nlmsg_end(skb, nlh);
4811 nlmsg_cancel(skb, nlh);
4815 static bool fib6_info_uses_dev(const struct fib6_info *f6i,
4816 const struct net_device *dev)
4818 if (f6i->fib6_nh.fib_nh_dev == dev)
4821 if (f6i->fib6_nsiblings) {
4822 struct fib6_info *sibling, *next_sibling;
4824 list_for_each_entry_safe(sibling, next_sibling,
4825 &f6i->fib6_siblings, fib6_siblings) {
4826 if (sibling->fib6_nh.fib_nh_dev == dev)
4834 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4836 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4837 struct fib_dump_filter *filter = &arg->filter;
4838 unsigned int flags = NLM_F_MULTI;
4839 struct net *net = arg->net;
4841 if (rt == net->ipv6.fib6_null_entry)
4844 if ((filter->flags & RTM_F_PREFIX) &&
4845 !(rt->fib6_flags & RTF_PREFIX_RT)) {
4846 /* success since this is not a prefix route */
4849 if (filter->filter_set) {
4850 if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
4851 (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
4852 (filter->protocol && rt->fib6_protocol != filter->protocol)) {
4855 flags |= NLM_F_DUMP_FILTERED;
4858 return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4859 RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4860 arg->cb->nlh->nlmsg_seq, flags);
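/* Strict validation of RTM_GETROUTE requests: with strict checking
 * enabled, header fields other than rtm_src_len/rtm_dst_len (which must
 * be 0 or 128) and the RTM_F_FIB_MATCH flag must be zero, and only a
 * small whitelist of attributes is accepted.
 */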
4863 static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
4864 const struct nlmsghdr *nlh,
4866 struct netlink_ext_ack *extack)
4871 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
4872 NL_SET_ERR_MSG_MOD(extack,
4873 "Invalid header for get route request");
4877 if (!netlink_strict_get_check(skb))
4878 return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX,
4879 rtm_ipv6_policy, extack);
4881 rtm = nlmsg_data(nlh);
4882 if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
4883 (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
4884 rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
4886 NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
4889 if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
4890 NL_SET_ERR_MSG_MOD(extack,
4891 "Invalid flags for get route request");
4895 err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
4896 rtm_ipv6_policy, extack);
4900 if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
4901 (tb[RTA_DST] && !rtm->rtm_dst_len)) {
4902 NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
4906 for (i = 0; i <= RTA_MAX; i++) {
4922 NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
4930 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4931 struct netlink_ext_ack *extack)
4933 struct net *net = sock_net(in_skb->sk);
4934 struct nlattr *tb[RTA_MAX+1];
4935 int err, iif = 0, oif = 0;
4936 struct fib6_info *from;
4937 struct dst_entry *dst;
4938 struct rt6_info *rt;
4939 struct sk_buff *skb;
4941 struct flowi6 fl6 = {};
4944 err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
4949 rtm = nlmsg_data(nlh);
4950 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4951 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4954 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4957 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4961 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4964 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4968 iif = nla_get_u32(tb[RTA_IIF]);
4971 oif = nla_get_u32(tb[RTA_OIF]);
4974 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4977 fl6.flowi6_uid = make_kuid(current_user_ns(),
4978 nla_get_u32(tb[RTA_UID]));
4980 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4983 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
4986 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
4988 if (tb[RTA_IP_PROTO]) {
4989 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
4990 &fl6.flowi6_proto, AF_INET6,
4997 struct net_device *dev;
5002 dev = dev_get_by_index_rcu(net, iif);
5009 fl6.flowi6_iif = iif;
5011 if (!ipv6_addr_any(&fl6.saddr))
5012 flags |= RT6_LOOKUP_F_HAS_SADDR;
5014 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
5018 fl6.flowi6_oif = oif;
5020 dst = ip6_route_output(net, NULL, &fl6);
5024 rt = container_of(dst, struct rt6_info, dst);
5025 if (rt->dst.error) {
5026 err = rt->dst.error;
5031 if (rt == net->ipv6.ip6_null_entry) {
5032 err = rt->dst.error;
5037 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
5044 skb_dst_set(skb, &rt->dst);
5047 from = rcu_dereference(rt->from);
5050 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
5051 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
5054 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
5055 &fl6.saddr, iif, RTM_NEWROUTE,
5056 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
5065 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
5070 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
5071 unsigned int nlm_flags)
5073 struct sk_buff *skb;
5074 struct net *net = info->nl_net;
5079 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
5081 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
5085 err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
5086 event, info->portid, seq, nlm_flags);
5088 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
5089 WARN_ON(err == -EMSGSIZE);
5093 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
5094 info->nlh, gfp_any());
5098 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		net->ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = dev;
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	} else if (event == NETDEV_UNREGISTER &&
		   dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER could be fired for multiple times by
		 * netdev_wait_allrefs(). Make sure we only call this once.
		 */
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}
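
/* /proc/net/rt6_stats: one line of seven "%04x" fields, in this order:
 * fib_nodes, fib_route_nodes, fib_rt_alloc, fib_rt_entries, fib_rt_cache,
 * current dst entries (ip6_dst_ops) and fib_discarded_routes.
 */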
#ifdef CONFIG_PROC_FS
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;

	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}
#endif	/* CONFIG_PROC_FS */
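
/* Handler for the write-only net.ipv6.route.flush sysctl: writing to
 * /proc/sys/net/ipv6/route/flush triggers fib6_run_gc() so that cached
 * routes are aged out right away.
 */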
#ifdef CONFIG_SYSCTL

static int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
				     void __user *buffer, size_t *lenp, loff_t *ppos)
{
	struct net *net;
	int delay, ret;

	if (!write)
		return -EINVAL;

	net = (struct net *)ctl->extra1;
	delay = net->ipv6.sysctl.flush_delay;
	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
	if (ret)
		return ret;

	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
	return 0;
}
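
/* Template for the per-namespace net.ipv6.route.* sysctl table. It is
 * duplicated by ipv6_route_sysctl_init(), which rewrites the .data
 * pointers by array index, so the entry order below must stay in sync
 * with those indices.
 */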
static struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	= "flush",
		.data		= &init_net.ipv6.sysctl.flush_delay,
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv6_sysctl_rtcache_flush
	},
	{
		.procname	= "gc_thresh",
		.data		= &ip6_dst_ops_template.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_min_interval",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "skip_notify_on_dev_down",
		.data		= &init_net.ipv6.sysctl.skip_notify_on_dev_down,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};
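
/* Clone the template for a namespace and point every .data at that
 * namespace's copy of the value. For namespaces not owned by
 * init_user_ns the first procname is cleared, which truncates the table
 * and keeps these sysctls from being exported to unprivileged users.
 */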
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
#endif	/* CONFIG_SYSCTL */
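
/* Per-namespace setup: copy the dst_ops template, allocate this
 * namespace's null (and, with multiple tables, prohibit/blackhole) route
 * entries, and seed the routing sysctl and GC defaults.
 */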
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
	net->ipv6.sysctl.skip_notify_on_dev_down = 0;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}

static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}

static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
			sizeof(struct ipv6_route_iter));
	proc_create_net_single("rt6_stats", 0444, net->proc_net,
			       rt6_stats_seq_show, NULL);
#endif
	return 0;
}

static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}

static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};

static int __net_init ipv6_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv6.peers = bp;
	return 0;
}

static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	net->ipv6.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static struct pernet_operations ipv6_inetpeer_ops = {
	.init = ipv6_inetpeer_init,
	.exit = ipv6_inetpeer_exit,
};

static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};
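
/* Run after addrconf's notifier (ADDRCONF_NOTIFY_PRIORITY) so that the
 * loopback inet6_dev has already been set up before ip6_route_dev_notify()
 * takes references to it.
 */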
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};

void __init ip6_route_init_special_entries(void)
{
	/* The loopback device is registered before this code runs, so the
	 * loopback reference in rt6_info has not been taken via the device
	 * notifier; take it manually for init_net.
	 */
	init_net.ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#endif
}
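
/* Subsystem init: create the dst kmem cache, register the pernet ops, the
 * fib/xfrm/rule layers, the rtnetlink route handlers and the device
 * notifier, then initialise the per-cpu uncached route lists. Errors
 * unwind in reverse order of registration.
 */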
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
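
/* Undo the registrations made by ip6_route_init(). */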
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}