2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
24 * Fixed routing subtrees.
27 #define pr_fmt(fmt) "IPv6: " fmt
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
56 #include <linux/rtnetlink.h>
58 #include <net/dst_metadata.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <trace/events/fib6.h>
68 #include <linux/uaccess.h>
71 #include <linux/sysctl.h>
75 RT6_NUD_FAIL_HARD = -3,
76 RT6_NUD_FAIL_PROBE = -2,
77 RT6_NUD_FAIL_DO_RR = -1,
/*
 * Forward declarations for the dst_ops callbacks and fib6 helpers that are
 * implemented later in the file.
 * NOTE(review): this extract has gaps (several original lines are missing
 * between declarations, e.g. trailing parameters of rt6_do_redirect and
 * rt6_fill_node); verify against the complete source file.
 */
81 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
82 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
83 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
84 static unsigned int ip6_mtu(const struct dst_entry *dst);
85 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
86 static void ip6_dst_destroy(struct dst_entry *);
87 static void ip6_dst_ifdown(struct dst_entry *,
88 struct net_device *dev, int how);
89 static int ip6_dst_gc(struct dst_ops *ops);
91 static int ip6_pkt_discard(struct sk_buff *skb);
92 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
93 static int ip6_pkt_prohibit(struct sk_buff *skb);
94 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
95 static void ip6_link_failure(struct sk_buff *skb);
96 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
97 struct sk_buff *skb, u32 mtu);
98 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
100 static void rt6_dst_from_metrics_check(struct rt6_info *rt);
101 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
102 static size_t rt6_nlmsg_size(struct rt6_info *rt);
103 static int rt6_fill_node(struct net *net,
104 struct sk_buff *skb, struct rt6_info *rt,
105 struct in6_addr *dst, struct in6_addr *src,
106 int iif, int type, u32 portid, u32 seq,
108 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
109 struct in6_addr *daddr,
110 struct in6_addr *saddr);
/* Route Information option (RFC 4191) support is conditional. */
112 #ifdef CONFIG_IPV6_ROUTE_INFO
113 static struct rt6_info *rt6_add_route_info(struct net *net,
114 const struct in6_addr *prefix, int prefixlen,
115 const struct in6_addr *gwaddr,
116 struct net_device *dev,
118 static struct rt6_info *rt6_get_route_info(struct net *net,
119 const struct in6_addr *prefix, int prefixlen,
120 const struct in6_addr *gwaddr,
121 struct net_device *dev);
/*
 * Per-CPU list of "uncached" rt6_info entries (cached routes not attached to
 * the fib6 tree), so they can be walked on device teardown.
 * NOTE(review): the spinlock member and closing braces are missing from this
 * extract — presumably a "spinlock_t lock;" field exists; confirm upstream.
 */
124 struct uncached_list {
126 struct list_head head;
129 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
/* Link rt onto the current CPU's uncached list under ul->lock. */
131 static void rt6_uncached_list_add(struct rt6_info *rt)
133 struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
135 rt->rt6i_uncached_list = ul;
137 spin_lock_bh(&ul->lock);
138 list_add_tail(&rt->rt6i_uncached, &ul->head);
139 spin_unlock_bh(&ul->lock);
/* Unlink rt from the list it was added to (no-op if never added). */
142 static void rt6_uncached_list_del(struct rt6_info *rt)
144 if (!list_empty(&rt->rt6i_uncached)) {
145 struct uncached_list *ul = rt->rt6i_uncached_list;
147 spin_lock_bh(&ul->lock);
148 list_del(&rt->rt6i_uncached);
149 spin_unlock_bh(&ul->lock);
/* On device unregister, repoint uncached routes at the loopback device so
 * the outgoing device reference can be dropped.  Skips loopback itself.
 */
153 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
155 struct net_device *loopback_dev = net->loopback_dev;
158 if (dev == loopback_dev)
161 for_each_possible_cpu(cpu) {
162 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
165 spin_lock_bh(&ul->lock);
166 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
167 struct inet6_dev *rt_idev = rt->rt6i_idev;
168 struct net_device *rt_dev = rt->dst.dev;
/* Transfer the idev reference to loopback's inet6_dev. */
170 if (rt_idev->dev == dev) {
171 rt->rt6i_idev = in6_dev_get(loopback_dev);
172 in6_dev_put(rt_idev);
/* NOTE(review): the matching "if (rt_dev == dev)" test and the
 * dev_put() of the old device appear to be among the missing lines.
 */
176 rt->dst.dev = loopback_dev;
177 dev_hold(rt->dst.dev);
181 spin_unlock_bh(&ul->lock);
/* For per-CPU clones, copy-on-write metrics live on the parent (dst.from). */
185 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
187 return dst_metrics_write_ptr(rt->dst.from);
/* dst_ops->cow_metrics: pick the metrics source by route flavour
 * (per-CPU clone, cache clone, or plain route).
 * NOTE(review): the RTF_CACHE branch body is missing from this extract.
 */
190 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
192 struct rt6_info *rt = (struct rt6_info *)dst;
194 if (rt->rt6i_flags & RTF_PCPU)
195 return rt6_pcpu_cow_metrics(rt);
196 else if (rt->rt6i_flags & RTF_CACHE)
199 return dst_cow_metrics_generic(dst, old);
/* Neighbour key: the gateway if one is set, otherwise the packet's (or
 * caller-supplied) destination address.
 */
202 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
206 struct in6_addr *p = &rt->rt6i_gateway;
208 if (!ipv6_addr_any(p))
209 return (const void *) p;
211 return &ipv6_hdr(skb)->daddr;
/* dst_ops->neigh_lookup: find (or create) the ND neighbour entry for the
 * next hop on dst->dev.
 */
215 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
219 struct rt6_info *rt = (struct rt6_info *) dst;
222 daddr = choose_neigh_daddr(rt, skb, daddr);
223 n = __ipv6_neigh_lookup(dst->dev, daddr);
226 return neigh_create(&nd_tbl, daddr, dst->dev);
/* dst_ops->confirm_neigh: confirm reachability of the next hop, skipping
 * devices without neighbour resolution and multicast destinations.
 */
229 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
231 struct net_device *dev = dst->dev;
232 struct rt6_info *rt = (struct rt6_info *)dst;
234 daddr = choose_neigh_daddr(rt, NULL, daddr);
237 if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
239 if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
241 __ipv6_confirm_neigh(dev, daddr);
/* Callback table for normal IPv6 dst entries; copied per-netns into
 * net->ipv6.ip6_dst_ops.
 * NOTE(review): .family/.gc/.mtu and some other initializers are missing
 * from this extract.
 */
244 static struct dst_ops ip6_dst_ops_template = {
248 .check = ip6_dst_check,
249 .default_advmss = ip6_default_advmss,
251 .cow_metrics = ipv6_cow_metrics,
252 .destroy = ip6_dst_destroy,
253 .ifdown = ip6_dst_ifdown,
254 .negative_advice = ip6_negative_advice,
255 .link_failure = ip6_link_failure,
256 .update_pmtu = ip6_rt_update_pmtu,
257 .redirect = rt6_do_redirect,
258 .local_out = __ip6_local_out,
259 .neigh_lookup = ip6_neigh_lookup,
260 .confirm_neigh = ip6_confirm_neigh,
/* Blackhole dsts report the raw MTU metric, falling back to the device MTU. */
263 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
265 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
267 return mtu ? : dst->dev->mtu;
/* Deliberate no-ops: blackhole routes ignore PMTU updates and redirects. */
270 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
271 struct sk_buff *skb, u32 mtu)
275 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
/* Callback table for blackhole dst entries (e.g. ip6_dst_blackhole()). */
280 static struct dst_ops ip6_dst_blackhole_ops = {
282 .destroy = ip6_dst_destroy,
283 .check = ip6_dst_check,
284 .mtu = ip6_blackhole_mtu,
285 .default_advmss = ip6_default_advmss,
286 .update_pmtu = ip6_rt_blackhole_update_pmtu,
287 .redirect = ip6_rt_blackhole_redirect,
288 .cow_metrics = dst_cow_metrics_generic,
289 .neigh_lookup = ip6_neigh_lookup,
/* Default metrics for the template routes below (hoplimit 0 = use default). */
292 static const u32 ip6_template_metrics[RTAX_MAX] = {
293 [RTAX_HOPLIMIT - 1] = 0,
/* Template for the per-netns "null" route: rejects with -ENETUNREACH.
 * NOTE(review): the nested .dst initializer braces are missing from this
 * extract in all three templates below.
 */
296 static const struct rt6_info ip6_null_entry_template = {
298 .__refcnt = ATOMIC_INIT(1),
300 .obsolete = DST_OBSOLETE_FORCE_CHK,
301 .error = -ENETUNREACH,
302 .input = ip6_pkt_discard,
303 .output = ip6_pkt_discard_out,
305 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
306 .rt6i_protocol = RTPROT_KERNEL,
307 .rt6i_metric = ~(u32) 0,
308 .rt6i_ref = ATOMIC_INIT(1),
/* Prohibit and blackhole templates only exist with policy routing. */
311 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
313 static const struct rt6_info ip6_prohibit_entry_template = {
315 .__refcnt = ATOMIC_INIT(1),
317 .obsolete = DST_OBSOLETE_FORCE_CHK,
319 .input = ip6_pkt_prohibit,
320 .output = ip6_pkt_prohibit_out,
322 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
323 .rt6i_protocol = RTPROT_KERNEL,
324 .rt6i_metric = ~(u32) 0,
325 .rt6i_ref = ATOMIC_INIT(1),
/* Blackhole: silently discard in both directions, no ICMP error. */
328 static const struct rt6_info ip6_blk_hole_entry_template = {
330 .__refcnt = ATOMIC_INIT(1),
332 .obsolete = DST_OBSOLETE_FORCE_CHK,
334 .input = dst_discard,
335 .output = dst_discard_out,
337 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
338 .rt6i_protocol = RTPROT_KERNEL,
339 .rt6i_metric = ~(u32) 0,
340 .rt6i_ref = ATOMIC_INIT(1),
/* Zero everything after the embedded dst_entry and init the list heads. */
345 static void rt6_info_init(struct rt6_info *rt)
347 struct dst_entry *dst = &rt->dst;
349 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
350 INIT_LIST_HEAD(&rt->rt6i_siblings);
351 INIT_LIST_HEAD(&rt->rt6i_uncached);
354 /* allocate dst with ip6_dst_ops */
355 static struct rt6_info *__ip6_dst_alloc(struct net *net,
356 struct net_device *dev,
359 struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
360 1, DST_OBSOLETE_FORCE_CHK, flags);
/* Public allocator: also sets up the per-CPU clone array; on percpu
 * allocation failure the half-built dst is released and NULL presumably
 * returned (the error-path lines are missing from this extract).
 */
368 struct rt6_info *ip6_dst_alloc(struct net *net,
369 struct net_device *dev,
372 struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
375 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
379 for_each_possible_cpu(cpu) {
382 p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
383 /* no one shares rt */
387 dst_release_immediate(&rt->dst);
394 EXPORT_SYMBOL(ip6_dst_alloc);
/* dst_ops->destroy: drop metrics, per-CPU clones, uncached-list linkage,
 * the idev reference, the exception bucket, and the parent (dst->from).
 * NOTE(review): the in6_dev_put()/kfree(bucket)/dst_release(from) lines are
 * among those missing from this extract.
 */
396 static void ip6_dst_destroy(struct dst_entry *dst)
398 struct rt6_info *rt = (struct rt6_info *)dst;
399 struct rt6_exception_bucket *bucket;
400 struct dst_entry *from = dst->from;
401 struct inet6_dev *idev;
403 dst_destroy_metrics_generic(dst);
404 free_percpu(rt->rt6i_pcpu);
405 rt6_uncached_list_del(rt);
407 idev = rt->rt6i_idev;
409 rt->rt6i_idev = NULL;
412 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
414 rt->rt6i_exception_bucket = NULL;
/* dst_ops->ifdown: when dev goes away, repoint the route's idev at the
 * loopback device of dev's netns.
 */
422 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
425 struct rt6_info *rt = (struct rt6_info *)dst;
426 struct inet6_dev *idev = rt->rt6i_idev;
427 struct net_device *loopback_dev =
428 dev_net(dev)->loopback_dev;
430 if (idev && idev->dev != loopback_dev) {
431 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
433 rt->rt6i_idev = loopback_idev;
/* True if rt itself carries RTF_EXPIRES and its expiry time has passed. */
439 static bool __rt6_check_expired(const struct rt6_info *rt)
441 if (rt->rt6i_flags & RTF_EXPIRES)
442 return time_after(jiffies, rt->dst.expires);
/* Like above, but a clone without RTF_EXPIRES is also considered expired when
 * its parent (dst.from) is expired or obsoleted.
 */
447 static bool rt6_check_expired(const struct rt6_info *rt)
449 if (rt->rt6i_flags & RTF_EXPIRES) {
450 if (time_after(jiffies, rt->dst.expires))
452 } else if (rt->dst.from) {
453 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
454 rt6_check_expired((struct rt6_info *)rt->dst.from);
/* Pick one route among ECMP siblings using the flow hash; index 0 keeps
 * "match" itself.  NOTE(review): "route_choosen" is a long-standing typo for
 * "route_chosen" (local variable only, harmless).
 */
459 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
460 struct flowi6 *fl6, int oif,
463 struct rt6_info *sibling, *next_sibling;
466 /* We might have already computed the hash for ICMPv6 errors. In such
467 * case it will always be non-zero. Otherwise now is the time to do it.
470 fl6->mp_hash = rt6_multipath_hash(fl6, NULL);
472 route_choosen = fl6->mp_hash % (match->rt6i_nsiblings + 1);
473 /* Don't change the route, if route_choosen == 0
474 * (siblings does not include ourself)
477 list_for_each_entry_safe(sibling, next_sibling,
478 &match->rt6i_siblings, rt6i_siblings) {
/* A sibling with a negative score is skipped — TODO confirm against the
 * missing lines; the decrement of route_choosen is not visible here.
 */
480 if (route_choosen == 0) {
481 if (rt6_score_route(sibling, oif, strict) < 0)
/*
491 * Route lookup. Any table->tb6_lock is implied.
 *
 * Walk the fib6 node's route list and pick the entry matching the requested
 * output interface (oif) and/or source address.  Falls back to the netns
 * null entry when RT6_LOOKUP_F_IFACE demands a device match and none exists.
 * NOTE(review): several branches (the "local" bookkeeping, saddr matching
 * result handling) are missing from this extract.
 */
494 static inline struct rt6_info *rt6_device_match(struct net *net,
496 const struct in6_addr *saddr,
500 struct rt6_info *local = NULL;
501 struct rt6_info *sprt;
/* No constraints at all: the head route wins (return lines not visible). */
503 if (!oif && ipv6_addr_any(saddr))
506 for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
507 struct net_device *dev = sprt->dst.dev;
510 if (dev->ifindex == oif)
/* Loopback routes: honour the idev's real ifindex when strict. */
512 if (dev->flags & IFF_LOOPBACK) {
513 if (!sprt->rt6i_idev ||
514 sprt->rt6i_idev->dev->ifindex != oif) {
515 if (flags & RT6_LOOKUP_F_IFACE)
518 local->rt6i_idev->dev->ifindex == oif)
524 if (ipv6_chk_addr(net, saddr, dev,
525 flags & RT6_LOOKUP_F_IFACE))
534 if (flags & RT6_LOOKUP_F_IFACE)
535 return net->ipv6.ip6_null_entry;
/* Router Reachability Probing (RFC 4191): probe a default router's
 * reachability via a deferred neighbour solicitation.
 */
541 #ifdef CONFIG_IPV6_ROUTER_PREF
542 struct __rt6_probe_work {
543 struct work_struct work;
544 struct in6_addr target;
545 struct net_device *dev;
/* Workqueue callback: send an NS to the target's solicited-node multicast
 * address.  NOTE(review): the dev_put(work->dev)/kfree(work) cleanup lines
 * are missing from this extract.
 */
548 static void rt6_probe_deferred(struct work_struct *w)
550 struct in6_addr mcaddr;
551 struct __rt6_probe_work *work =
552 container_of(w, struct __rt6_probe_work, work);
554 addrconf_addr_solict_mult(&work->target, &mcaddr);
555 ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
/* Schedule a reachability probe for rt's gateway, rate-limited per
 * rtr_probe_interval, only for gateway routes with no VALID neighbour.
 */
560 static void rt6_probe(struct rt6_info *rt)
562 struct __rt6_probe_work *work;
563 struct neighbour *neigh;
565 * Okay, this does not seem to be appropriate
566 * for now, however, we need to check if it
567 * is really so; aka Router Reachability Probing.
569 * Router Reachability Probe MUST be rate-limited
570 * to no more than one per minute.
572 if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
575 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
577 if (neigh->nud_state & NUD_VALID)
/* Existing neighbour: probe at most once per interval under its lock. */
581 write_lock(&neigh->lock);
582 if (!(neigh->nud_state & NUD_VALID) &&
585 rt->rt6i_idev->cnf.rtr_probe_interval)) {
586 work = kmalloc(sizeof(*work), GFP_ATOMIC);
588 __neigh_set_probe_once(neigh);
590 write_unlock(&neigh->lock);
/* No neighbour entry yet: probe unconditionally. */
592 work = kmalloc(sizeof(*work), GFP_ATOMIC);
596 INIT_WORK(&work->work, rt6_probe_deferred);
597 work->target = rt->rt6i_gateway;
598 dev_hold(rt->dst.dev);
599 work->dev = rt->dst.dev;
600 schedule_work(&work->work);
604 rcu_read_unlock_bh();
/* !CONFIG_IPV6_ROUTER_PREF stub: probing compiled out. */
607 static inline void rt6_probe(struct rt6_info *rt)
/*
613 * Default Router Selection (RFC 2461 6.3.6)
 *
 * Device-match score: non-zero when rt's device (or, for loopback routes,
 * its idev's device) satisfies the requested oif.
 */
615 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
617 struct net_device *dev = rt->dst.dev;
618 if (!oif || dev->ifindex == oif)
620 if ((dev->flags & IFF_LOOPBACK) &&
621 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
/* Neighbour-reachability component of the route score.  Non-gateway routes
 * always succeed; otherwise the gateway's NUD state decides, with the
 * router-preference config softening NUD_FAILED into a probe request.
 */
626 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
628 struct neighbour *neigh;
629 enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
631 if (rt->rt6i_flags & RTF_NONEXTHOP ||
632 !(rt->rt6i_flags & RTF_GATEWAY))
633 return RT6_NUD_SUCCEED;
636 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
638 read_lock(&neigh->lock);
639 if (neigh->nud_state & NUD_VALID)
640 ret = RT6_NUD_SUCCEED;
641 #ifdef CONFIG_IPV6_ROUTER_PREF
642 else if (!(neigh->nud_state & NUD_FAILED))
643 ret = RT6_NUD_SUCCEED;
645 ret = RT6_NUD_FAIL_PROBE;
647 read_unlock(&neigh->lock);
/* No neighbour entry at all: round-robin unless router-pref is enabled. */
649 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
650 RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
652 rcu_read_unlock_bh();
/* Combine device match, router preference bits, and (when strict demands
 * reachability) the neighbour check into a single comparable score.
 */
657 static int rt6_score_route(struct rt6_info *rt, int oif,
662 m = rt6_check_dev(rt, oif);
663 if (!m && (strict & RT6_LOOKUP_F_IFACE))
664 return RT6_NUD_FAIL_HARD;
665 #ifdef CONFIG_IPV6_ROUTER_PREF
666 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
668 if (strict & RT6_LOOKUP_F_REACHABLE) {
669 int n = rt6_check_neigh(rt);
/* Compare rt against the best match so far (score in *mpri); sets *do_rr
 * when a round-robin rotation of fn->rr_ptr is warranted.
 */
676 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
677 int *mpri, struct rt6_info *match,
681 bool match_do_rr = false;
682 struct inet6_dev *idev = rt->rt6i_idev;
683 struct net_device *dev = rt->dst.dev;
/* Optionally skip routes whose device has no carrier. */
685 if (dev && !netif_carrier_ok(dev) &&
686 idev->cnf.ignore_routes_with_linkdown &&
687 !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
690 if (rt6_check_expired(rt))
693 m = rt6_score_route(rt, oif, strict);
694 if (m == RT6_NUD_FAIL_DO_RR) {
696 m = 0; /* lowest valid score */
697 } else if (m == RT6_NUD_FAIL_HARD) {
/* Trigger a reachability probe for candidates when strict requires it. */
701 if (strict & RT6_LOOKUP_F_REACHABLE)
704 /* note that m can be RT6_NUD_FAIL_PROBE at this point */
706 *do_rr = match_do_rr;
/* Scan the routes of one metric, starting at rr_head, wrapping through the
 * leaf list, with RTF_EXPIRES entries collected into a continuation pass.
 */
714 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
715 struct rt6_info *leaf,
716 struct rt6_info *rr_head,
717 u32 metric, int oif, int strict,
720 struct rt6_info *rt, *match, *cont;
725 for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
726 if (rt->rt6i_metric != metric) {
731 match = find_match(rt, oif, strict, &mpri, match, do_rr);
734 for (rt = leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
735 if (rt->rt6i_metric != metric) {
740 match = find_match(rt, oif, strict, &mpri, match, do_rr);
746 for (rt = cont; rt; rt = rt->dst.rt6_next)
747 match = find_match(rt, oif, strict, &mpri, match, do_rr);
/* Select the best route under fn, maintaining fn->rr_ptr for round-robin
 * among equal-metric default routers.
 */
752 static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
755 struct rt6_info *leaf = fn->leaf;
756 struct rt6_info *match, *rt0;
761 return net->ipv6.ip6_null_entry;
765 fn->rr_ptr = rt0 = leaf;
767 /* Double check to make sure fn is not an intermediate node
768 * and fn->leaf does not points to its child's leaf
769 * (This might happen if all routes under fn are deleted from
770 * the tree and fib6_repair_tree() is called on the node.)
772 key_plen = rt0->rt6i_dst.plen;
773 #ifdef CONFIG_IPV6_SUBTREES
774 if (rt0->rt6i_src.plen)
775 key_plen = rt0->rt6i_src.plen;
777 if (fn->fn_bit != key_plen)
778 return net->ipv6.ip6_null_entry;
780 match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
/* Round-robin: advance rr_ptr to the next same-metric sibling. */
784 struct rt6_info *next = rt0->dst.rt6_next;
786 /* no entries matched; do round-robin */
787 if (!next || next->rt6i_metric != rt0->rt6i_metric)
794 return match ? match : net->ipv6.ip6_null_entry;
/* True when rt has a gateway or is flagged no-next-hop. */
797 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
799 return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
/* Process a Route Information option from a Router Advertisement
 * (RFC 4191): validate lengths, then add/update/expire the corresponding
 * RTF_ROUTEINFO route.  NOTE(review): the early-return bodies of the sanity
 * checks are missing from this extract.
 */
802 #ifdef CONFIG_IPV6_ROUTE_INFO
803 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
804 const struct in6_addr *gwaddr)
806 struct net *net = dev_net(dev);
807 struct route_info *rinfo = (struct route_info *) opt;
808 struct in6_addr prefix_buf, *prefix;
810 unsigned long lifetime;
813 if (len < sizeof(struct route_info)) {
817 /* Sanity check for prefix_len and length */
818 if (rinfo->length > 3) {
820 } else if (rinfo->prefix_len > 128) {
822 } else if (rinfo->prefix_len > 64) {
823 if (rinfo->length < 2) {
826 } else if (rinfo->prefix_len > 0) {
827 if (rinfo->length < 1) {
832 pref = rinfo->route_pref;
833 if (pref == ICMPV6_ROUTER_PREF_INVALID)
836 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
/* length == 3 means a full 16-byte prefix is present in the option. */
838 if (rinfo->length == 3)
839 prefix = (struct in6_addr *)rinfo->prefix;
841 /* this function is safe */
842 ipv6_addr_prefix(&prefix_buf,
843 (struct in6_addr *)rinfo->prefix,
845 prefix = &prefix_buf;
/* prefix_len 0 denotes the default route from this router. */
848 if (rinfo->prefix_len == 0)
849 rt = rt6_get_dflt_router(gwaddr, dev);
851 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
/* Zero lifetime withdraws an existing route. */
854 if (rt && !lifetime) {
860 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
863 rt->rt6i_flags = RTF_ROUTEINFO |
864 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
867 if (!addrconf_finite_timeout(lifetime))
868 rt6_clean_expires(rt);
870 rt6_set_expires(rt, jiffies + HZ * lifetime);
/* Walk back up the fib6 tree from fn to the next node carrying route info,
 * descending into source-routed subtrees where present.  Returns NULL at
 * the tree root (the loop/return lines are missing from this extract).
 */
878 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
879 struct in6_addr *saddr)
881 struct fib6_node *pn;
883 if (fn->fn_flags & RTN_TL_ROOT)
886 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
887 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
890 if (fn->fn_flags & RTN_RTINFO)
/* Take a reference on *prt if still live; otherwise substitute the netns
 * null entry (presumably returning false — the tail is not visible here).
 */
895 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
898 struct rt6_info *rt = *prt;
900 if (dst_hold_safe(&rt->dst))
903 rt = net->ipv6.ip6_null_entry;
/* Policy-rule lookup worker: find the best route in one table, preferring a
 * cached exception entry when one matches, backtracking on a null match.
 */
912 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
913 struct fib6_table *table,
914 struct flowi6 *fl6, int flags)
916 struct rt6_info *rt, *rt_cache;
917 struct fib6_node *fn;
919 read_lock_bh(&table->tb6_lock);
920 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
923 rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
924 if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
925 rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
926 if (rt == net->ipv6.ip6_null_entry) {
927 fn = fib6_backtrack(fn, &fl6->saddr);
931 /* Search through exception table */
932 rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
936 if (ip6_hold_safe(net, &rt, true))
937 dst_use_noref(&rt->dst, jiffies);
939 read_unlock_bh(&table->tb6_lock);
941 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
/* Public lookup entry point dispatching through policy rules. */
947 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
950 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
952 EXPORT_SYMBOL_GPL(ip6_route_lookup);
/* Convenience wrapper building a flowi6 from daddr/saddr/oif; "strict"
 * maps to RT6_LOOKUP_F_IFACE.
 */
954 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
955 const struct in6_addr *saddr, int oif, int strict)
957 struct flowi6 fl6 = {
961 struct dst_entry *dst;
962 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
965 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
966 flags |= RT6_LOOKUP_F_HAS_SADDR;
969 dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
971 return (struct rt6_info *) dst;
977 EXPORT_SYMBOL(rt6_lookup);
979 /* ip6_ins_rt is called with FREE table->tb6_lock.
980 * It takes new route entry, the addition fails by any reason the
982 * Caller must hold dst before calling it.
 *
 * Insert rt into its fib6 table under the table write lock; returns the
 * fib6_add() result.
985 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
986 struct mx6_config *mxc,
987 struct netlink_ext_ack *extack)
990 struct fib6_table *table;
992 table = rt->rt6i_table;
993 write_lock_bh(&table->tb6_lock);
994 err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
995 write_unlock_bh(&table->tb6_lock);
/* Simple wrapper with default netlink info and empty metrics. */
1000 int ip6_ins_rt(struct rt6_info *rt)
1002 struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
1003 struct mx6_config mxc = { .mx = NULL, };
1005 /* Hold dst to account for the reference from the fib6 tree */
1007 return __ip6_ins_rt(rt, &info, &mxc, NULL);
1010 /* called with rcu_lock held */
/* Pick the device for a copy of a local route: the l3mdev master when
 * enslaved (and not link-local strict), loopback otherwise.
 */
1011 static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
1013 struct net_device *dev = rt->dst.dev;
1015 if (rt->rt6i_flags & RTF_LOCAL) {
1016 /* for copies of local routes, dst->dev needs to be the
1017 * device if it is a master device, the master device if
1018 * device is enslaved, and the loopback as the default
1020 if (netif_is_l3_slave(dev) &&
1021 !rt6_need_strict(&rt->rt6i_dst.addr))
1022 dev = l3mdev_master_dev_rcu(dev);
1023 else if (!netif_is_l3_master(dev))
1024 dev = dev_net(dev)->loopback_dev;
1025 /* last case is netif_is_l3_master(dev) is true in which
1026 * case we want dev returned to be dev
/* Create an RTF_CACHE clone of ort keyed to (daddr, saddr), e.g. for PMTU
 * or redirect exceptions.  The clone is host-scoped (/128).
 */
1033 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
1034 const struct in6_addr *daddr,
1035 const struct in6_addr *saddr)
1037 struct net_device *dev;
1038 struct rt6_info *rt;
/* Never clone a clone: walk back to the tree route via dst.from. */
1044 if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
1045 ort = (struct rt6_info *)ort->dst.from;
1048 dev = ip6_rt_get_dev_rcu(ort);
1049 rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
1054 ip6_rt_copy_init(rt, ort);
1055 rt->rt6i_flags |= RTF_CACHE;
1056 rt->rt6i_metric = 0;
1057 rt->dst.flags |= DST_HOST;
1058 rt->rt6i_dst.addr = *daddr;
1059 rt->rt6i_dst.plen = 128;
1061 if (!rt6_is_gw_or_nonexthop(ort)) {
1062 if (ort->rt6i_dst.plen != 128 &&
1063 ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
1064 rt->rt6i_flags |= RTF_ANYCAST;
1065 #ifdef CONFIG_IPV6_SUBTREES
1066 if (rt->rt6i_src.plen && saddr) {
1067 rt->rt6i_src.addr = *saddr;
1068 rt->rt6i_src.plen = 128;
/* Allocate a per-CPU RTF_PCPU clone of rt (protocol copied explicitly since
 * ip6_rt_copy_init() presumably does not — TODO confirm).
 */
1076 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
1078 struct net_device *dev;
1079 struct rt6_info *pcpu_rt;
1082 dev = ip6_rt_get_dev_rcu(rt);
1083 pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
1087 ip6_rt_copy_init(pcpu_rt, rt);
1088 pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1089 pcpu_rt->rt6i_flags |= RTF_PCPU;
1093 /* It should be called with read_lock_bh(&tb6_lock) acquired */
/* Return this CPU's cached clone of rt, taking a reference, or NULL. */
1094 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1096 struct rt6_info *pcpu_rt, **p;
1098 p = this_cpu_ptr(rt->rt6i_pcpu);
1101 if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
1102 rt6_dst_from_metrics_check(pcpu_rt);
/* Create and install this CPU's clone of rt, racing via cmpxchg; on a lost
 * race the already-installed clone is returned instead.
 */
1107 static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
1109 struct rt6_info *pcpu_rt, *prev, **p;
1111 pcpu_rt = ip6_rt_pcpu_alloc(rt);
1113 struct net *net = dev_net(rt->dst.dev);
1115 dst_hold(&net->ipv6.ip6_null_entry->dst);
1116 return net->ipv6.ip6_null_entry;
1119 dst_hold(&pcpu_rt->dst);
1120 p = this_cpu_ptr(rt->rt6i_pcpu);
1121 prev = cmpxchg(p, NULL, pcpu_rt);
1123 /* If someone did it before us, return prev instead */
1124 /* release refcnt taken by ip6_rt_pcpu_alloc() */
1125 dst_release_immediate(&pcpu_rt->dst);
1126 /* release refcnt taken by above dst_hold() */
1127 dst_release_immediate(&pcpu_rt->dst);
1128 dst_hold(&prev->dst);
1132 rt6_dst_from_metrics_check(pcpu_rt);
1136 /* exception hash table implementation
 *
 * RTF_CACHE clones (PMTU/redirect exceptions) hang off their parent route in
 * a small hash table; all writers serialize on this global lock, readers use
 * RCU.
 */
1138 static DEFINE_SPINLOCK(rt6_exception_lock);
1140 /* Remove rt6_ex from hash table and free the memory
1141 * Caller must hold rt6_exception_lock
1143 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1144 struct rt6_exception *rt6_ex)
1146 if (!bucket || !rt6_ex)
1148 rt6_ex->rt6i->rt6i_node = NULL;
1149 hlist_del_rcu(&rt6_ex->hlist);
1150 rt6_release(rt6_ex->rt6i);
1151 kfree_rcu(rt6_ex, rcu);
1152 WARN_ON_ONCE(!bucket->depth);
1156 /* Remove oldest rt6_ex in bucket and free the memory
1157 * Caller must hold rt6_exception_lock
1159 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1161 struct rt6_exception *rt6_ex, *oldest = NULL;
1166 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
/* Oldest = smallest (earliest) jiffies stamp. */
1167 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1170 rt6_remove_exception(bucket, oldest);
/* Hash (dst, src) into FIB6_EXCEPTION_BUCKET_SIZE buckets; src is only
 * mixed in when CONFIG_IPV6_SUBTREES is enabled.
 */
1173 static u32 rt6_exception_hash(const struct in6_addr *dst,
1174 const struct in6_addr *src)
1176 static u32 seed __read_mostly;
1179 net_get_random_once(&seed, sizeof(seed));
1180 val = jhash(dst, sizeof(*dst), seed);
1182 #ifdef CONFIG_IPV6_SUBTREES
1184 val = jhash(src, sizeof(*src), val);
1186 return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1189 /* Helper function to find the cached rt in the hash table
1190 * and update bucket pointer to point to the bucket for this
1191 * (daddr, saddr) pair
1192 * Caller must hold rt6_exception_lock
 *
 * Writer-side variant: plain hlist traversal is safe under the spinlock.
1194 static struct rt6_exception *
1195 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1196 const struct in6_addr *daddr,
1197 const struct in6_addr *saddr)
1199 struct rt6_exception *rt6_ex;
1202 if (!(*bucket) || !daddr)
1205 hval = rt6_exception_hash(daddr, saddr);
1208 hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1209 struct rt6_info *rt6 = rt6_ex->rt6i;
1210 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1212 #ifdef CONFIG_IPV6_SUBTREES
1213 if (matched && saddr)
1214 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1222 /* Helper function to find the cached rt in the hash table
1223 * and update bucket pointer to point to the bucket for this
1224 * (daddr, saddr) pair
1225 * Caller must hold rcu_read_lock()
 *
 * Reader-side variant: same logic as above but with RCU-safe traversal.
1227 static struct rt6_exception *
1228 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1229 const struct in6_addr *daddr,
1230 const struct in6_addr *saddr)
1232 struct rt6_exception *rt6_ex;
1235 WARN_ON_ONCE(!rcu_read_lock_held());
1237 if (!(*bucket) || !daddr)
1240 hval = rt6_exception_hash(daddr, saddr);
1243 hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1244 struct rt6_info *rt6 = rt6_ex->rt6i;
1245 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1247 #ifdef CONFIG_IPV6_SUBTREES
1248 if (matched && saddr)
1249 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
/* Insert cache clone nrt as an exception of tree route ort, creating the
 * bucket array on first use, replacing any existing (daddr,saddr) entry,
 * and evicting the oldest entry if the bucket overflows.  On success the
 * parent's fn_sernum is bumped to invalidate stale cached dsts.
 */
1257 static int rt6_insert_exception(struct rt6_info *nrt,
1258 struct rt6_info *ort)
1260 struct rt6_exception_bucket *bucket;
1261 struct in6_addr *src_key = NULL;
1262 struct rt6_exception *rt6_ex;
1265 /* ort can't be a cache or pcpu route */
1266 if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
1267 ort = (struct rt6_info *)ort->dst.from;
1268 WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))；
1270 spin_lock_bh(&rt6_exception_lock);
/* Parent is being flushed (rt6_flush_exceptions()): refuse new entries. */
1272 if (ort->exception_bucket_flushed) {
1277 bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1278 lockdep_is_held(&rt6_exception_lock));
1280 bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1286 rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1289 #ifdef CONFIG_IPV6_SUBTREES
1290 /* rt6i_src.plen != 0 indicates ort is in subtree
1291 * and exception table is indexed by a hash of
1292 * both rt6i_dst and rt6i_src.
1293 * Otherwise, the exception table is indexed by
1294 * a hash of only rt6i_dst.
1296 if (ort->rt6i_src.plen)
1297 src_key = &nrt->rt6i_src.addr;
1300 /* Update rt6i_prefsrc as it could be changed
1301 * in rt6_remove_prefsrc()
1303 nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
1304 /* rt6_mtu_change() might lower mtu on ort.
1305 * Only insert this exception route if its mtu
1306 * is less than ort's mtu value.
1308 if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
1313 rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1316 rt6_remove_exception(bucket, rt6_ex);
1318 rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1324 rt6_ex->stamp = jiffies;
1325 atomic_inc(&nrt->rt6i_ref);
1326 nrt->rt6i_node = ort->rt6i_node;
1327 hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1330 if (bucket->depth > FIB6_MAX_DEPTH)
1331 rt6_exception_remove_oldest(bucket);
1334 spin_unlock_bh(&rt6_exception_lock);
1336 /* Update fn->fn_sernum to invalidate all cached dst */
1338 fib6_update_sernum(ort);
/* Drop every exception entry hung off rt; sets exception_bucket_flushed so
 * concurrent rt6_insert_exception() callers cannot recreate the table.
 * NOTE(review): the kfree() of the bucket array is among the lines missing
 * from this extract.
 */
1343 void rt6_flush_exceptions(struct rt6_info *rt)
1345 struct rt6_exception_bucket *bucket;
1346 struct rt6_exception *rt6_ex;
1347 struct hlist_node *tmp;
1350 spin_lock_bh(&rt6_exception_lock);
1351 /* Prevent rt6_insert_exception() to recreate the bucket list */
1352 rt->exception_bucket_flushed = 1;
1354 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1355 lockdep_is_held(&rt6_exception_lock));
1359 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1360 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1361 rt6_remove_exception(bucket, rt6_ex);
1362 WARN_ON_ONCE(bucket->depth);
1367 spin_unlock_bh(&rt6_exception_lock);
1370 /* Find cached rt in the hash table inside passed in rt
1371 * Caller has to hold rcu_read_lock()
 *
 * Returns the unexpired exception clone for (daddr, saddr), or NULL.
1373 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
1374 struct in6_addr *daddr,
1375 struct in6_addr *saddr)
1377 struct rt6_exception_bucket *bucket;
1378 struct in6_addr *src_key = NULL;
1379 struct rt6_exception *rt6_ex;
1380 struct rt6_info *res = NULL;
1382 bucket = rcu_dereference(rt->rt6i_exception_bucket);
1384 #ifdef CONFIG_IPV6_SUBTREES
1385 /* rt6i_src.plen != 0 indicates rt is in subtree
1386 * and exception table is indexed by a hash of
1387 * both rt6i_dst and rt6i_src.
1388 * Otherwise, the exception table is indexed by
1389 * a hash of only rt6i_dst.
1391 if (rt->rt6i_src.plen)
1394 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1396 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1402 /* Remove the passed in cached rt from the hash table that contains it */
1403 int rt6_remove_exception_rt(struct rt6_info *rt)
1405 struct rt6_info *from = (struct rt6_info *)rt->dst.from;
1406 struct rt6_exception_bucket *bucket;
1407 struct in6_addr *src_key = NULL;
1408 struct rt6_exception *rt6_ex;
1412 !(rt->rt6i_flags | RTF_CACHE))
1415 if (!rcu_access_pointer(from->rt6i_exception_bucket))
1418 spin_lock_bh(&rt6_exception_lock);
1419 bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1420 lockdep_is_held(&rt6_exception_lock));
1421 #ifdef CONFIG_IPV6_SUBTREES
1422 /* rt6i_src.plen != 0 indicates 'from' is in subtree
1423 * and exception table is indexed by a hash of
1424 * both rt6i_dst and rt6i_src.
1425 * Otherwise, the exception table is indexed by
1426 * a hash of only rt6i_dst.
1428 if (from->rt6i_src.plen)
1429 src_key = &rt->rt6i_src.addr;
1431 rt6_ex = __rt6_find_exception_spinlock(&bucket,
1435 rt6_remove_exception(bucket, rt6_ex);
1441 spin_unlock_bh(&rt6_exception_lock);
1445 /* Find rt6_ex which contains the passed in rt cache and
1448 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1450 struct rt6_info *from = (struct rt6_info *)rt->dst.from;
1451 struct rt6_exception_bucket *bucket;
1452 struct in6_addr *src_key = NULL;
1453 struct rt6_exception *rt6_ex;
1456 !(rt->rt6i_flags | RTF_CACHE))
1460 bucket = rcu_dereference(from->rt6i_exception_bucket);
1462 #ifdef CONFIG_IPV6_SUBTREES
1463 /* rt6i_src.plen != 0 indicates 'from' is in subtree
1464 * and exception table is indexed by a hash of
1465 * both rt6i_dst and rt6i_src.
1466 * Otherwise, the exception table is indexed by
1467 * a hash of only rt6i_dst.
1469 if (from->rt6i_src.plen)
1470 src_key = &rt->rt6i_src.addr;
1472 rt6_ex = __rt6_find_exception_rcu(&bucket,
1476 rt6_ex->stamp = jiffies;
/* Clear the preferred source address of every cached clone hanging off
 * @rt's exception table (plen = 0 marks "no prefsrc").  Caller holds
 * rt6_exception_lock, as the lockdep annotation below asserts.
 */
static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	/* walk every hash bucket; each chain holds cached clones of rt */
	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
/* Lower the cached PMTU of every exception-table clone of @rt that
 * currently stores a larger value than @mtu.  Caller holds
 * rt6_exception_lock (see lockdep annotation).
 */
static void rt6_exceptions_update_pmtu(struct rt6_info *rt, int mtu)
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;
			/* For RTF_CACHE with rt6i_pmtu == 0
			 * (i.e. a redirected route),
			 * the metrics of its rt->dst.from has already
			 */
			if (entry->rt6i_pmtu && entry->rt6i_pmtu > mtu)
				entry->rt6i_pmtu = mtu;
/* A clone is "to host via gateway" when both flags are set at once. */
#define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE)

/* Drop every cached gateway clone of @rt whose gateway equals @gateway.
 * Used when a router stops being a valid next hop.  Takes
 * rt6_exception_lock itself; safe iteration (hlist_for_each_entry_safe)
 * because entries are removed while walking.
 */
static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
					struct in6_addr *gateway)
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;

	/* cheap RCU-safe test: nothing to do if no exception table exists */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp,
					  &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
			    RTF_CACHE_GATEWAY &&
			    ipv6_addr_equal(gateway,
					    &entry->rt6i_gateway)) {
				rt6_remove_exception(bucket, rt6_ex);
	spin_unlock_bh(&rt6_exception_lock);
/* GC helper: decide whether one exception-table entry should be purged.
 * Removes the clone when (a) nobody but the table holds it and it has
 * been idle past the GC timeout, or (b) it is a gateway clone whose
 * neighbour is no longer advertising itself as a router (NTF_ROUTER).
 */
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
	struct rt6_info *rt = rt6_ex->rt6i;

	/* refcnt == 1 means only the exception table references it */
	if (atomic_read(&rt->dst.__refcnt) == 1 &&
	    time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
		RT6_TRACE("aging clone %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
	} else if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway);
			neigh_flags = neigh->flags;
			neigh_release(neigh);
		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
			rt6_remove_exception(bucket, rt6_ex);
/* Periodic GC entry point: walk @rt's whole exception table and let
 * rt6_age_examine_exception() purge stale entries.  Takes
 * rt6_exception_lock; early-outs cheaply when no table is attached.
 */
void rt6_age_exceptions(struct rt6_info *rt,
			struct fib6_gc_args *gc_args,
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp,
					  &bucket->chain, hlist) {
			rt6_age_examine_exception(bucket, rt6_ex,
	spin_unlock_bh(&rt6_exception_lock);
/* Core policy-routing lookup for one FIB table.  Finds the best route
 * for @fl6 in @table, honoring @oif and @flags, then returns one of:
 *   - the null entry (held) on total miss,
 *   - a held RTF_CACHE clone from the exception table,
 *   - a one-off uncached clone (FLOWI_FLAG_KNOWN_NH, non-gateway case),
 *   - or a per-cpu copy of the fib entry (common case).
 * Runs under read_lock_bh(&table->tb6_lock); every exit path drops it.
 * The unlock/re-take choreography around rt6_make_pcpu_route() is
 * deliberate — see the inline comments — so statement order matters.
 * NOTE(review): several lines (labels, else-arms, returns) are elided
 * in this view; control flow shown here is incomplete.
 */
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6, int flags)
	struct fib6_node *fn, *saved_fn;
	struct rt6_info *rt, *rt_cache;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	/* a host (forwarding disabled) prefers reachable routers */
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	read_lock_bh(&table->tb6_lock);

	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
	rt = rt6_select(net, fn, oif, strict);
	if (rt->rt6i_nsiblings)
		rt = rt6_multipath_select(rt, fl6, oif, strict);
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			goto redo_rt6_select;

	/*Search through exception table */
	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);

	if (rt == net->ipv6.ip6_null_entry) {
		read_unlock_bh(&table->tb6_lock);
		trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
	} else if (rt->rt6i_flags & RTF_CACHE) {
		if (ip6_hold_safe(net, &rt, true)) {
			dst_use_noref(&rt->dst, jiffies);
			rt6_dst_from_metrics_check(rt);
		read_unlock_bh(&table->tb6_lock);
		trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(rt->rt6i_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		if (ip6_hold_safe(net, &rt, true)) {
			dst_use_noref(&rt->dst, jiffies);
			read_unlock_bh(&table->tb6_lock);
			goto uncached_rt_out;
		read_unlock_bh(&table->tb6_lock);

		uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
		dst_release(&rt->dst);
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);

		trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
		/* Get a percpu copy */
		struct rt6_info *pcpu_rt;

		dst_use_noref(&rt->dst, jiffies);
		pcpu_rt = rt6_get_pcpu_route(rt);

			read_unlock_bh(&table->tb6_lock);
		/* atomic_inc_not_zero() is needed when using rcu */
		if (atomic_inc_not_zero(&rt->rt6i_ref)) {
			/* We have to do the read_unlock first
			 * because rt6_make_pcpu_route() may trigger
			 * ip6_dst_gc() which will take the write_lock.
			 *
			 * No dst_hold() on rt is needed because grabbing
			 * rt->rt6i_ref makes sure rt can't be released.
			 */
			read_unlock_bh(&table->tb6_lock);
			pcpu_rt = rt6_make_pcpu_route(rt);
			/* rt is already removed from tree */
			read_unlock_bh(&table->tb6_lock);
			pcpu_rt = net->ipv6.ip6_null_entry;
			dst_hold(&pcpu_rt->dst);

		trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
EXPORT_SYMBOL_GPL(ip6_pol_route);
/* Thin adapter for fib6_rule_lookup(): input-path lookups key on the
 * incoming interface (flowi6_iif).
 */
static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
					    struct flowi6 *fl6, int flags)
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);

/* Input-path route lookup entry point.  Link-local / multicast
 * destinations force a strict interface match — except on PIM register
 * devices (ARPHRD_PIMREG), which are virtual.
 */
struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6, int flags)
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
/* Build L3 flow-dissector keys for multipath hashing.  For relevant
 * ICMPv6 error messages the keys are taken from the embedded (inner)
 * offending header, so errors hash onto the same path as the flow that
 * triggered them; otherwise the outer header is used.
 */
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys)
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))

	icmph = icmp6_hdr(skb);
	/* only ICMPv6 errors carry the offending packet's header */
	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)

	/* copy out the inner header safely (may be non-linear) */
	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);

	key_iph = inner_iph;

	memset(keys, 0, sizeof(*keys));
	keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
	keys->addrs.v6addrs.src = key_iph->saddr;
	keys->addrs.v6addrs.dst = key_iph->daddr;
	keys->tags.flow_label = ip6_flowinfo(key_iph);
	keys->basic.ip_proto = key_iph->nexthdr;
/* if skb is set it will be used and fl6 can be NULL */
/* Multipath hash: from the skb's (possibly inner ICMPv6) L3 keys when a
 * skb is available, otherwise directly from the flow descriptor.
 */
u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
	struct flow_keys hash_keys;

		ip6_multipath_l3_keys(skb, &hash_keys);
		return flow_hash_from_keys(&hash_keys);

	return get_hash_from_flowi6(fl6);

/* Receive-path routing: build a flowi6 from the packet headers, attach
 * tunnel metadata if present, compute a multipath hash for ICMPv6 (so
 * errors follow their flow), and set skb's dst from the lookup result.
 */
void ip6_route_input(struct sk_buff *skb)
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(&fl6, skb);

	skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
/* Output-path adapter for fib6_rule_lookup(): keys on flowi6_oif. */
static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
					     struct flowi6 *fl6, int flags)
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);

/* Locally-generated traffic lookup.  Link-local destinations are
 * resolved through the L3 master device path; otherwise strictness and
 * source-address flags are derived from the socket and flow before
 * delegating to the policy-rule lookup.
 */
struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
	if (rt6_need_strict(&fl6->daddr)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	/* bound socket, strict daddr, or oif with unspecified saddr all
	 * require the output interface to match
	 */
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

		flags |= RT6_LOOKUP_F_HAS_SADDR;
	flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
/* Clone @dst_orig into a blackhole dst (used e.g. by xfrm while policy
 * resolution is pending): copies metrics and identity from the original
 * but discards all traffic via dst_discard/dst_discard_out.  Always
 * consumes a reference to @dst_orig; returns -ENOMEM on alloc failure.
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_NONE, 0);

		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);
		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		/* RTF_PCPU must not leak into the clone */
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
		rt->rt6i_metric = 0;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
/*
 *	Destination cache support functions
 */

/* Re-point this clone's metrics at its parent (dst.from) if the parent's
 * metrics block has been replaced since the clone was made.
 */
static void rt6_dst_from_metrics_check(struct rt6_info *rt)
	    dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
		dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);

/* Validate a plain route: cookie must match the fib6 tree generation and
 * the route must not have expired.
 */
static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
	if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)

	if (rt6_check_expired(rt))

/* Validate a clone through its parent: the clone must not be expired,
 * must still be in FORCE_CHK state, and its parent must pass rt6_check().
 */
static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
	if (!__rt6_check_expired(rt) &&
	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
	    rt6_check((struct rt6_info *)(rt->dst.from), cookie))

/* dst_ops->check hook: decide whether a cached dst is still usable. */
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
	struct rt6_info *rt;

	rt = (struct rt6_info *) dst;

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */
	rt6_dst_from_metrics_check(rt);

	/* per-cpu copies and uncached clones validate via their parent */
	if (rt->rt6i_flags & RTF_PCPU ||
	    (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from))
		return rt6_dst_from_check(rt, cookie);
	return rt6_check(rt, cookie);
/* dst_ops->negative_advice hook: expired cache entries are candidates
 * for being dropped by the caller.
 */
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt->rt6i_flags & RTF_CACHE) {
		if (rt6_check_expired(rt)) {

/* dst_ops->link_failure hook: report unreachability to the sender and
 * invalidate the route that failed (cache entries are held-and-dropped;
 * default routes have their fib node's generation bumped).
 */
static void ip6_link_failure(struct sk_buff *skb)
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt->rt6i_flags & RTF_CACHE) {
		if (dst_hold_safe(&rt->dst))
		struct fib6_node *fn;

		fn = rcu_dereference(rt->rt6i_node);
		if (fn && (rt->rt6i_flags & RTF_DEFAULT))
/* Record a new path MTU on @rt and (re)arm its expiry using the
 * per-netns ip6_rt_mtu_expires sysctl.
 */
static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
	struct net *net = dev_net(rt->dst.dev);

	rt->rt6i_flags |= RTF_MODIFIED;
	rt->rt6i_pmtu = mtu;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);

/* True when @rt is a fib-owned route (pcpu copy or still in the tree)
 * rather than an RTF_CACHE clone — i.e. a PMTU update must be stored in
 * a new cache clone instead of mutating @rt in place.
 */
static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
	return !(rt->rt6i_flags & RTF_CACHE) &&
		(rt->rt6i_flags & RTF_PCPU ||
		 rcu_access_pointer(rt->rt6i_node));
/* Apply a learned path MTU to @dst.  Addresses come from @iph when a
 * packet header is available, else from the socket.  Locked-MTU and
 * local routes are left untouched; values below IPV6_MIN_MTU (1280) are
 * clamped up.  Cache clones are updated in place; fib-owned routes get
 * a new RTF_CACHE clone inserted into the exception table.
 */
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	if (rt6->rt6i_flags & RTF_LOCAL)

	if (dst_metric_locked(dst, RTAX_MTU))

		daddr = &iph->daddr;
		saddr = &iph->saddr;
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;

	/* a PMTU message proves the next hop is alive */
	dst_confirm_neigh(dst, daddr);
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
		struct rt6_info *nrt6;

		nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
			rt6_do_update_pmtu(nrt6, mtu);
			/* insertion failure means a duplicate raced in;
			 * drop our clone
			 */
			if (rt6_insert_exception(nrt6, rt6))
				dst_release_immediate(&nrt6->dst);
/* dst_ops->update_pmtu hook: forwards to __ip6_rt_update_pmtu with the
 * packet's IPv6 header when one is attached.
 */
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu)
	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);

/* Handle an incoming ICMPv6 Packet Too Big: rebuild the flow that the
 * offending packet described, look its route up, and store the new MTU.
 */
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
		     int oif, u32 mark, kuid_t uid)
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
	fl6.flowlabel = ip6_flowinfo(iph);
	fl6.flowi6_uid = uid;

	dst = ip6_route_output(net, NULL, &fl6);
		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
EXPORT_SYMBOL_GPL(ip6_update_pmtu);

/* Socket-flavored PMTU update; additionally refreshes the socket's
 * cached dst when it has become invalid (connected datagram sockets).
 */
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
	struct dst_entry *dst;

	ip6_update_pmtu(skb, sock_net(sk), mtu,
			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);

	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))

	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
/* Handle redirects */
/* flowi6 wrapper that smuggles the redirecting router's address into
 * the fib6_rule_lookup() callback.
 */
struct ip6rd_flowi {
	struct in6_addr gateway;

/* Lookup used when processing an ICMPv6 Redirect: find the route whose
 * current next hop is the router that sent the redirect (per RFC 4861
 * only the current first-hop router may redirect us).  Gateways of
 * cached clones are checked too, since a previous redirect may have
 * changed them.  Returns a held route (possibly the null entry).
 * NOTE(review): several lines (loop exits, labels) are elided here.
 */
static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *rt, *rt_cache;
	struct fib6_node *fn;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 */
	read_lock_bh(&table->tb6_lock);
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
		if (rt6_check_expired(rt))
		if (!(rt->rt6i_flags & RTF_GATEWAY))
		if (fl6->flowi6_oif != rt->dst.dev->ifindex)
		/* rt_cache's gateway might be different from its 'parent'
		 * in the case of an ip redirect.
		 * So we keep searching in the exception table if the gateway
		 */
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
			rt_cache = rt6_find_cached_rt(rt,
			    ipv6_addr_equal(&rdfl->gateway,
					    &rt_cache->rt6i_gateway)) {

		rt = net->ipv6.ip6_null_entry;
	else if (rt->dst.error) {
		rt = net->ipv6.ip6_null_entry;

	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);

	ip6_hold_safe(net, &rt, true);

	read_unlock_bh(&table->tb6_lock);

	trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
/* Wrap @fl6 and the redirecting router's address into an ip6rd_flowi
 * and dispatch the policy lookup through __ip6_route_redirect().
 */
static struct dst_entry *ip6_route_redirect(struct net *net,
					    const struct flowi6 *fl6,
					    const struct in6_addr *gateway)
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip6rd_flowi rdfl;

	rdfl.gateway = *gateway;

	return fib6_rule_lookup(net, &rdfl.fl6,
				flags, __ip6_route_redirect);

/* Process an ICMPv6 Redirect whose offending packet header is at
 * skb->data: rebuild the flow and apply the redirect to its route.
 */
void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_iif = LOOPBACK_IFINDEX;
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark;
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
	fl6.flowlabel = ip6_flowinfo(iph);
	fl6.flowi6_uid = uid;

	dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
	rt6_do_redirect(dst, NULL, skb);
EXPORT_SYMBOL_GPL(ip6_redirect);

/* Variant for redirects without the usual embedded header: the target
 * is taken from the rd_msg in the ICMPv6 payload instead.
 */
void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
	struct dst_entry *dst;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_iif = LOOPBACK_IFINDEX;
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark;
	fl6.daddr = msg->dest;
	fl6.saddr = iph->daddr;
	fl6.flowi6_uid = sock_net_uid(net, NULL);

	dst = ip6_route_redirect(net, &fl6, &iph->saddr);
	rt6_do_redirect(dst, NULL, skb);

/* Socket convenience wrapper around ip6_redirect(). */
void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
EXPORT_SYMBOL_GPL(ip6_sk_redirect);
/* dst_ops->default_advmss hook: derive the advertised MSS from the path
 * MTU, clamped between the ip6_rt_min_advmss sysctl and the largest MSS
 * expressible in a non-jumbo payload.
 */
static unsigned int ip6_default_advmss(const struct dst_entry *dst)
	struct net_device *dev = dst->dev;
	unsigned int mtu = dst_mtu(dst);
	struct net *net = dev_net(dev);

	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);

	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;

	/*
	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
	 * IPV6_MAXPLEN is also valid and means: "any MSS,
	 * rely only on pmtu discovery"
	 */
	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))

/* dst_ops->mtu hook: prefer the learned path MTU, then the route's raw
 * MTU metric, then the device's configured IPv6 MTU; subtract any
 * lightweight-tunnel encapsulation headroom from the result.
 */
static unsigned int ip6_mtu(const struct dst_entry *dst)
	const struct rt6_info *rt = (const struct rt6_info *)dst;
	unsigned int mtu = rt->rt6i_pmtu;
	struct inet6_dev *idev;

	mtu = dst_metric_raw(dst, RTAX_MTU);

	idev = __in6_dev_get(dst->dev);
		mtu = idev->cnf.mtu6;

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
/* Allocate a throw-away host route for sending an ICMPv6/NDISC packet
 * out of @dev.  The dst is placed on the uncached list (so device
 * teardown can release it) and passed through xfrm_lookup() before
 * being returned.  Returns ERR_PTR on failure.
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		dst = ERR_PTR(-ENOMEM);

	rt->dst.flags |= DST_HOST;
	rt->dst.output = ip6_output;
	rt->rt6i_gateway = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev = idev;
	/* hop limit 0 here means "use the device/ndisc default" */
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Add this dst into uncached_list so that rt6_ifdown() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
/* dst_ops->gc hook: run fib6 garbage collection when the entry count or
 * elapsed time warrants it.  ip6_rt_gc_expire is an adaptive aging
 * parameter: it grows each unproductive pass and decays geometrically
 * (by 1/2^elasticity) once the table is back under gc_thresh.
 * Returns non-zero while the table is still over rt_max_size.
 */
static int ip6_dst_gc(struct dst_ops *ops)
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;

	entries = dst_entries_get_fast(ops);
	/* too soon since the last GC and not over the cap: skip */
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)

	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;

	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
	return entries > rt_max_size;
/* Convert the netlink RTAX_* metric attributes in @cfg into the
 * kzalloc'd u32 array + validity bitmap of @mxc.  Validates each
 * metric (known type, hop limit <= 255, only defined feature bits) and
 * resolves congestion-control names to keys; an ECN-enabling CA sets
 * the DST_FEATURE_ECN_CA feature bit.
 */
static int ip6_convert_metrics(struct mx6_config *mxc,
			       const struct fib6_config *cfg)
	bool ecn_ca = false;

	mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);

	nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
		int type = nla_type(nla);

		if (unlikely(type > RTAX_MAX))

		if (type == RTAX_CC_ALGO) {
			char tmp[TCP_CA_NAME_MAX];

			nla_strlcpy(tmp, nla, sizeof(tmp));
			val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
			if (val == TCP_CA_UNSPEC)
			val = nla_get_u32(nla);

		if (type == RTAX_HOPLIMIT && val > 255)
		if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))

		/* bitmap is 0-based while RTAX_* constants are 1-based */
		__set_bit(type - 1, mxc->mx_valid);

		__set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
		mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
/* Resolve a configured gateway (@gw_addr) within the specific table
 * named by @cfg — used to validate the next hop of a route being added.
 * Strict interface matching, link state ignored.  A null-entry result
 * means "not found here"; the caller then falls back to a full lookup.
 */
static struct rt6_info *ip6_nh_lookup_table(struct net *net,
					    struct fib6_config *cfg,
					    const struct in6_addr *gw_addr)
	struct flowi6 fl6 = {
		.flowi6_oif = cfg->fc_ifindex,
		.saddr = cfg->fc_prefsrc,
	struct fib6_table *table;
	struct rt6_info *rt;
	int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE;

	table = fib6_get_table(net, cfg->fc_table);

	if (!ipv6_addr_any(&cfg->fc_prefsrc))
		flags |= RT6_LOOKUP_F_HAS_SADDR;

	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);

	/* if table lookup failed, fall back to full lookup */
	if (rt == net->ipv6.ip6_null_entry) {
/* Build (but do not insert) a struct rt6_info from a netlink/ioctl
 * route configuration.  Validates the config (prefix lengths, flags,
 * gateway), resolves the output device and table, wires up the dst
 * input/output handlers (including reject routes and lightweight
 * tunnels), and returns the new route or an ERR_PTR.  Validation
 * failures are reported through @extack.
 * NOTE(review): many lines (error labels, some conditionals) are elided
 * in this view; the goto-cleanup structure is only partially visible.
 */
static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
					      struct netlink_ext_ack *extack)
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct rt6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	struct fib6_table *table;

	/* RTF_PCPU is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_PCPU) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");

	if (cfg->fc_dst_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
	if (cfg->fc_src_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid source address length");
#ifndef CONFIG_IPV6_SUBTREES
	if (cfg->fc_src_len) {
		NL_SET_ERR_MSG(extack,
			       "Specifying source address requires IPV6_SUBTREES to be enabled");

	if (cfg->fc_ifindex) {
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		idev = in6_dev_get(dev);

	if (cfg->fc_metric == 0)
		cfg->fc_metric = IP6_RT_PRIO_USER;

	/* without NLM_F_CREATE, only pre-existing tables may be used */
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		table = fib6_new_table(net, cfg->fc_table);

	rt = ip6_dst_alloc(net, NULL,
			   (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);

	if (cfg->fc_flags & RTF_EXPIRES)
		rt6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
		rt6_clean_expires(rt);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->rt6i_protocol = cfg->fc_protocol;

	addr_type = ipv6_addr_type(&cfg->fc_dst);

	/* choose the input handler by destination class */
	if (addr_type & IPV6_ADDR_MULTICAST)
		rt->dst.input = ip6_mc_input;
	else if (cfg->fc_flags & RTF_LOCAL)
		rt->dst.input = ip6_input;
		rt->dst.input = ip6_forward;

	rt->dst.output = ip6_output;

	if (cfg->fc_encap) {
		struct lwtunnel_state *lwtstate;

		err = lwtunnel_build_state(cfg->fc_encap_type,
					   cfg->fc_encap, AF_INET6, cfg,
		rt->dst.lwtstate = lwtstate_get(lwtstate);
		if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
			rt->dst.lwtstate->orig_output = rt->dst.output;
			rt->dst.output = lwtunnel_output;
		if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
			rt->dst.lwtstate->orig_input = rt->dst.input;
			rt->dst.input = lwtunnel_input;

	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->rt6i_dst.plen = cfg->fc_dst_len;
	if (rt->rt6i_dst.plen == 128)
		rt->dst.flags |= DST_HOST;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->rt6i_src.plen = cfg->fc_src_len;

	rt->rt6i_metric = cfg->fc_metric;

	/* We cannot add true routes via loopback here,
	   they would result in kernel looping; promote them to reject routes
	 */
	if ((cfg->fc_flags & RTF_REJECT) ||
	    (dev && (dev->flags & IFF_LOOPBACK) &&
	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
	     !(cfg->fc_flags & RTF_LOCAL))) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			dev = net->loopback_dev;
			idev = in6_dev_get(dev);
		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
		switch (cfg->fc_type) {
			rt->dst.error = -EINVAL;
			rt->dst.output = dst_discard_out;
			rt->dst.input = dst_discard;
			rt->dst.error = -EACCES;
			rt->dst.output = ip6_pkt_prohibit_out;
			rt->dst.input = ip6_pkt_prohibit;
		case RTN_UNREACHABLE:
			rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
					: (cfg->fc_type == RTN_UNREACHABLE)
					? -EHOSTUNREACH : -ENETUNREACH;
			rt->dst.output = ip6_pkt_discard_out;
			rt->dst.input = ip6_pkt_discard;

	if (cfg->fc_flags & RTF_GATEWAY) {
		const struct in6_addr *gw_addr;

		gw_addr = &cfg->fc_gateway;
		gwa_type = ipv6_addr_type(gw_addr);

		/* if gw_addr is local we will fail to detect this in case
		 * address is still TENTATIVE (DAD in progress). rt6_lookup()
		 * will return already-added prefix route via interface that
		 * prefix route was assigned to, which might be non-loopback.
		 */
		if (ipv6_chk_addr_and_flags(net, gw_addr,
					    gwa_type & IPV6_ADDR_LINKLOCAL ?
					    dev : NULL, 0, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid gateway address");
		rt->rt6i_gateway = *gw_addr;

		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
			struct rt6_info *grt = NULL;

			/* IPv6 strictly inhibits using not link-local
			   addresses as nexthop address.
			   Otherwise, router will not able to send redirects.
			   It is very good, but in some (rare!) circumstances
			   (SIT, PtP, NBMA NOARP links) it is handy to allow
			   some exceptions. --ANK
			   We allow IPv4-mapped nexthops to support RFC4798-type
			 */
			if (!(gwa_type & (IPV6_ADDR_UNICAST |
					  IPV6_ADDR_MAPPED))) {
				NL_SET_ERR_MSG(extack,
					       "Invalid gateway address");

			if (cfg->fc_table) {
				grt = ip6_nh_lookup_table(net, cfg, gw_addr);

					if (grt->rt6i_flags & RTF_GATEWAY ||
					    (dev && dev != grt->dst.dev)) {

				grt = rt6_lookup(net, gw_addr, NULL,
						 cfg->fc_ifindex, 1);

				err = -EHOSTUNREACH;

				if (dev != grt->dst.dev) {

				idev = grt->rt6i_idev;
				in6_dev_hold(grt->rt6i_idev);
			if (!(grt->rt6i_flags & RTF_GATEWAY))

			NL_SET_ERR_MSG(extack, "Egress device not specified");
		} else if (dev->flags & IFF_LOOPBACK) {
			NL_SET_ERR_MSG(extack,
				       "Egress device can not be loopback device for this route");

	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid source address");
		rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
		rt->rt6i_prefsrc.plen = 128;
		rt->rt6i_prefsrc.plen = 0;

	rt->rt6i_flags = cfg->fc_flags;

	rt->rt6i_idev = idev;
	rt->rt6i_table = table;

	cfg->fc_nlinfo.nl_net = dev_net(dev);

	dst_release_immediate(&rt->dst);

	return ERR_PTR(err);
/* Create a route from @cfg, convert its metrics, and insert it into the
 * FIB.  On any failure after creation the half-built route is released.
 */
int ip6_route_add(struct fib6_config *cfg,
		  struct netlink_ext_ack *extack)
	struct mx6_config mxc = { .mx = NULL, };
	struct rt6_info *rt;

	rt = ip6_route_info_create(cfg, extack);

	err = ip6_convert_metrics(&mxc, cfg);

	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);

	dst_release_immediate(&rt->dst);

/* Delete @rt from its FIB table under the table write lock.  The null
 * entry is never deletable.
 */
static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
	struct fib6_table *table;
	struct net *net = dev_net(rt->dst.dev);

	if (rt == net->ipv6.ip6_null_entry) {

	table = rt->rt6i_table;
	write_lock_bh(&table->tb6_lock);
	err = fib6_del(rt, info);
	write_unlock_bh(&table->tb6_lock);

/* Public single-route delete with default netlink notification info. */
int ip6_del_rt(struct rt6_info *rt)
	struct nl_info info = {
		.nl_net = dev_net(rt->dst.dev),

	return __ip6_del_rt(rt, &info);
/* Delete @rt and, when fc_delete_all_nh is set, all of its multipath
 * siblings in one pass.  A single RTM_DELROUTE notification covering
 * every hop is pre-built and sent after the table lock is dropped
 * (skip_notify suppresses the per-hop notifications from fib6_del).
 */
static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;

	if (rt == net->ipv6.ip6_null_entry)
	table = rt->rt6i_table;
	write_lock_bh(&table->tb6_lock);

	if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
		struct rt6_info *sibling, *next_sibling;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			if (rt6_fill_node(net, skb, rt,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				info->skip_notify = 1;

		list_for_each_entry_safe(sibling, next_sibling,
			err = fib6_del(sibling, info);

	err = fib6_del(rt, info);
	write_unlock_bh(&table->tb6_lock);

	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
/* Delete the route matching @cfg: locate the fib6 node for the
 * destination (and source, with subtrees), then walk its leaf chain for
 * an entry matching the configured ifindex / gateway / metric /
 * protocol.  RTF_CACHE requests match exception-table clones instead.
 * The table read lock is dropped before the actual delete, which
 * revalidates under the write lock.
 */
static int ip6_route_del(struct fib6_config *cfg,
			 struct netlink_ext_ack *extack)
	struct rt6_info *rt, *rt_cache;
	struct fib6_table *table;
	struct fib6_node *fn;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
		NL_SET_ERR_MSG(extack, "FIB table does not exist");

	read_lock_bh(&table->tb6_lock);

	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len,
			 !(cfg->fc_flags & RTF_CACHE));

		for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
			if (cfg->fc_flags & RTF_CACHE) {
				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
			if (cfg->fc_ifindex &&
			     rt->dst.dev->ifindex != cfg->fc_ifindex))
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
			if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
			/* a route being torn down concurrently is skipped */
			if (!dst_hold_safe(&rt->dst))
			read_unlock_bh(&table->tb6_lock);

			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				return __ip6_del_rt(rt, &cfg->fc_nlinfo);

			return __ip6_del_rt_siblings(rt, cfg);
	read_unlock_bh(&table->tb6_lock);
/* Handle an ICMPv6 Redirect (RFC 4861 section 8) for dst @dst.
 * Validates the redirect message and its ND options, updates the neighbour
 * cache, clones the route towards the new nexthop as a cached exception,
 * and fires a NETEVENT_REDIRECT notifier.
 * NOTE(review): the variable declarations for msg/lladdr/on_link and several
 * validation/exit lines are elided in this view.
 */
2907 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
2909 struct netevent_redirect netevent;
2910 struct rt6_info *rt, *nrt = NULL;
2911 struct ndisc_options ndopts;
2912 struct inet6_dev *in6_dev;
2913 struct neighbour *neigh;
2915 int optlen, on_link;
/* Length of ND options following the fixed redirect header. */
2918 optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
2919 optlen -= sizeof(*msg);
2922 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
2926 msg = (struct rd_msg *)icmp6_hdr(skb);
2928 if (ipv6_addr_is_multicast(&msg->dest)) {
2929 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
/* dest == target means redirect-to-self, i.e. the destination is on-link. */
2934 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
2936 } else if (ipv6_addr_type(&msg->target) !=
2937 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
2938 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
2942 in6_dev = __in6_dev_get(skb->dev);
/* Routers, or interfaces configured to ignore redirects, drop here. */
2945 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
2949 * The IP source address of the Redirect MUST be the same as the current
2950 * first-hop router for the specified ICMP Destination Address.
2953 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
2954 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
2959 if (ndopts.nd_opts_tgt_lladdr) {
2960 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
2963 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
2968 rt = (struct rt6_info *) dst;
2969 if (rt->rt6i_flags & RTF_REJECT) {
2970 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
2974 /* Redirect received -> path was valid.
2975 * Look, redirects are sent only in response to data packets,
2976 * so that this nexthop apparently is reachable. --ANK
2978 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
2980 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
2985 * We have finally decided to accept it.
/* Mark the new nexthop STALE; ISROUTER only when not an on-link redirect. */
2988 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
2989 NEIGH_UPDATE_F_WEAK_OVERRIDE|
2990 NEIGH_UPDATE_F_OVERRIDE|
2991 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
2992 NEIGH_UPDATE_F_ISROUTER)),
2993 NDISC_REDIRECT, &ndopts);
/* Clone a host route towards msg->dest via the new gateway. */
2995 nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
2999 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3001 nrt->rt6i_flags &= ~RTF_GATEWAY;
3003 nrt->rt6i_protocol = RTPROT_REDIRECT;
3004 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3006 /* No need to remove rt from the exception table if rt is
3007 * a cached route because rt6_insert_exception() will
/* Insertion failure: drop the clone immediately. */
3010 if (rt6_insert_exception(nrt, rt)) {
3011 dst_release_immediate(&nrt->dst);
3015 netevent.old = &rt->dst;
3016 netevent.new = &nrt->dst;
3017 netevent.daddr = &msg->dest;
3018 netevent.neigh = neigh;
3019 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3022 neigh_release(neigh);
3026 * Misc support functions
/* Link @rt to its parent route @from: take a reference on from's dst,
 * record it in rt->dst.from, and share from's metrics (read-only).
 * BUG_ON guards against chaining through an already-derived route.
 */
3029 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
3031 BUG_ON(from->dst.from);
3033 rt->rt6i_flags &= ~RTF_EXPIRES;
3034 dst_hold(&from->dst);
3035 rt->dst.from = &from->dst;
3036 dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
/* Initialize @rt as a copy of @ort: dst ops, addresses, flags, device
 * state and lwtunnel state, and record ort as rt's parent via
 * rt6_set_from() so metrics are shared.
 */
3039 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
3041 rt->dst.input = ort->dst.input;
3042 rt->dst.output = ort->dst.output;
3043 rt->rt6i_dst = ort->rt6i_dst;
3044 rt->dst.error = ort->dst.error;
3045 rt->rt6i_idev = ort->rt6i_idev;
/* The copy holds its own reference on the inet6 device. */
3047 in6_dev_hold(rt->rt6i_idev);
3048 rt->dst.lastuse = jiffies;
3049 rt->rt6i_gateway = ort->rt6i_gateway;
3050 rt->rt6i_flags = ort->rt6i_flags;
3051 rt6_set_from(rt, ort);
3052 rt->rt6i_metric = ort->rt6i_metric;
3053 #ifdef CONFIG_IPV6_SUBTREES
3054 rt->rt6i_src = ort->rt6i_src;
3056 rt->rt6i_prefsrc = ort->rt6i_prefsrc;
3057 rt->rt6i_table = ort->rt6i_table;
/* lwtstate is refcounted; take our own reference. */
3058 rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
3061 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Look up an RA Route Information route for @prefix/@prefixlen learned via
 * @gwaddr on @dev.  Returns the route with a reference held, or NULL.
 * Searches the RT6_TABLE_INFO table (or the l3mdev table when enslaved).
 */
3062 static struct rt6_info *rt6_get_route_info(struct net *net,
3063 const struct in6_addr *prefix, int prefixlen,
3064 const struct in6_addr *gwaddr,
3065 struct net_device *dev)
3067 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3068 int ifindex = dev->ifindex;
3069 struct fib6_node *fn;
3070 struct rt6_info *rt = NULL;
3071 struct fib6_table *table;
3073 table = fib6_get_table(net, tb_id);
3077 read_lock_bh(&table->tb6_lock);
3078 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
/* Only match RTF_ROUTEINFO gateway routes on the right interface. */
3082 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
3083 if (rt->dst.dev->ifindex != ifindex)
3085 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3087 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
3089 ip6_hold_safe(NULL, &rt, false);
3093 read_unlock_bh(&table->tb6_lock);
/* Install an RA Route Information route for @prefix/@prefixlen via @gwaddr
 * on @dev with the given router preference, then return the resulting
 * route looked up with rt6_get_route_info().
 * NOTE(review): the 'pref' parameter line is elided in this view.
 */
3097 static struct rt6_info *rt6_add_route_info(struct net *net,
3098 const struct in6_addr *prefix, int prefixlen,
3099 const struct in6_addr *gwaddr,
3100 struct net_device *dev,
3103 struct fib6_config cfg = {
3104 .fc_metric = IP6_RT_PRIO_USER,
3105 .fc_ifindex = dev->ifindex,
3106 .fc_dst_len = prefixlen,
3107 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3108 RTF_UP | RTF_PREF(pref),
3109 .fc_protocol = RTPROT_RA,
3110 .fc_nlinfo.portid = 0,
3111 .fc_nlinfo.nlh = NULL,
3112 .fc_nlinfo.nl_net = net,
/* l3mdev slaves use their master's table, otherwise RT6_TABLE_INFO. */
3115 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3116 cfg.fc_dst = *prefix;
3117 cfg.fc_gateway = *gwaddr;
3119 /* We should treat it as a default route if prefix length is 0. */
3121 cfg.fc_flags |= RTF_DEFAULT;
/* Add result is intentionally not checked; lookup below reflects state. */
3123 ip6_route_add(&cfg, NULL);
3125 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
/* Find the RA-learned default route via gateway @addr on @dev in the
 * default-router table (or the l3mdev table).  Returns the route with a
 * reference held, or NULL when not found.
 */
3129 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
3131 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3132 struct rt6_info *rt;
3133 struct fib6_table *table;
3135 table = fib6_get_table(dev_net(dev), tb_id);
3139 read_lock_bh(&table->tb6_lock);
/* Default routes live directly off the table root. */
3140 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
3141 if (dev == rt->dst.dev &&
3142 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3143 ipv6_addr_equal(&rt->rt6i_gateway, addr))
3147 ip6_hold_safe(NULL, &rt, false);
3148 read_unlock_bh(&table->tb6_lock);
/* Install an RA-learned default route via @gwaddr on @dev with the given
 * preference, mark the table as holding a default router, and return the
 * installed route via rt6_get_dflt_router().
 * NOTE(review): the 'pref' parameter line is elided in this view.
 */
3152 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
3153 struct net_device *dev,
3156 struct fib6_config cfg = {
3157 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3158 .fc_metric = IP6_RT_PRIO_USER,
3159 .fc_ifindex = dev->ifindex,
3160 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3161 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3162 .fc_protocol = RTPROT_RA,
3163 .fc_nlinfo.portid = 0,
3164 .fc_nlinfo.nlh = NULL,
3165 .fc_nlinfo.nl_net = dev_net(dev),
3168 cfg.fc_gateway = *gwaddr;
3170 if (!ip6_route_add(&cfg, NULL)) {
3171 struct fib6_table *table;
3173 table = fib6_get_table(dev_net(dev), cfg.fc_table);
/* Flag lets rt6_purge_dflt_routers() skip tables without defaults. */
3175 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3178 return rt6_get_dflt_router(gwaddr, dev);
/* Remove all RA-learned default routes from @table, except on interfaces
 * with accept_ra == 2 (accept RA even when forwarding).  Drops and
 * re-takes the read lock around each deletion, so the walk restarts.
 * NOTE(review): the deletion call and restart goto are elided in this view.
 */
3181 static void __rt6_purge_dflt_routers(struct fib6_table *table)
3183 struct rt6_info *rt;
3186 read_lock_bh(&table->tb6_lock);
3187 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
3188 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3189 (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
/* Hold a reference before unlocking for deletion. */
3190 if (dst_hold_safe(&rt->dst)) {
3191 read_unlock_bh(&table->tb6_lock);
3194 read_unlock_bh(&table->tb6_lock);
3199 read_unlock_bh(&table->tb6_lock);
3201 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
/* Purge RA-learned default routers from every fib6 table in @net that is
 * flagged RT6_TABLE_HAS_DFLT_ROUTER.  Walks the table hash under RCU.
 * NOTE(review): the rcu_read_lock/unlock and hash-index declaration are
 * elided in this view.
 */
3204 void rt6_purge_dflt_routers(struct net *net)
3206 struct fib6_table *table;
3207 struct hlist_head *head;
3212 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3213 head = &net->ipv6.fib_table_hash[h];
3214 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3215 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3216 __rt6_purge_dflt_routers(table);
/* Translate the legacy ioctl struct in6_rtmsg into a fib6_config for
 * ip6_route_add()/ip6_route_del().  Table selection honours l3mdev
 * enslavement of the requested interface.
 */
3223 static void rtmsg_to_fib6_config(struct net *net,
3224 struct in6_rtmsg *rtmsg,
3225 struct fib6_config *cfg)
3227 memset(cfg, 0, sizeof(*cfg));
3229 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3231 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3232 cfg->fc_metric = rtmsg->rtmsg_metric;
/* rtmsg_info carries the expiry for RTF_EXPIRES routes. */
3233 cfg->fc_expires = rtmsg->rtmsg_info;
3234 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3235 cfg->fc_src_len = rtmsg->rtmsg_src_len;
3236 cfg->fc_flags = rtmsg->rtmsg_flags;
3238 cfg->fc_nlinfo.nl_net = net;
3240 cfg->fc_dst = rtmsg->rtmsg_dst;
3241 cfg->fc_src = rtmsg->rtmsg_src;
3242 cfg->fc_gateway = rtmsg->rtmsg_gateway;
/* Legacy SIOCADDRT/SIOCDELRT ioctl entry point for IPv6 routes.
 * Requires CAP_NET_ADMIN in the netns user namespace; copies the
 * in6_rtmsg from userspace, converts it, and adds or deletes the route.
 * NOTE(review): rtnl locking and the return/error paths are elided in
 * this view.
 */
3245 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3247 struct fib6_config cfg;
3248 struct in6_rtmsg rtmsg;
3252 case SIOCADDRT: /* Add a route */
3253 case SIOCDELRT: /* Delete a route */
3254 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3256 err = copy_from_user(&rtmsg, arg,
3257 sizeof(struct in6_rtmsg));
3261 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3266 err = ip6_route_add(&cfg, NULL);
3269 err = ip6_route_del(&cfg, NULL);
3283 * Drop the packet on the floor
/* Common drop path for reject-type routes: bump the appropriate
 * no-route/addr-error SNMP counter and send an ICMPv6 Destination
 * Unreachable with @code.  Unspecified destinations on input count as
 * address errors rather than no-route.
 */
3286 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3289 struct dst_entry *dst = skb_dst(skb);
3290 switch (ipstats_mib_noroutes) {
3291 case IPSTATS_MIB_INNOROUTES:
3292 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3293 if (type == IPV6_ADDR_ANY) {
3294 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3295 IPSTATS_MIB_INADDRERRORS);
/* fallthrough to counting as no-route (per visible structure) */
3299 case IPSTATS_MIB_OUTNOROUTES:
3300 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3301 ipstats_mib_noroutes);
3304 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
/* dst input handler for blackhole/unreachable routes: drop with
 * "no route" semantics on the input path.
 */
3309 static int ip6_pkt_discard(struct sk_buff *skb)
3311 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
/* dst output handler counterpart of ip6_pkt_discard(): set skb->dev from
 * the dst before dropping so ICMP/stats attribute to the right device.
 */
3314 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3316 skb->dev = skb_dst(skb)->dev;
3317 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
/* dst input handler for prohibit routes: drop with "administratively
 * prohibited" ICMPv6 code on the input path.
 */
3320 static int ip6_pkt_prohibit(struct sk_buff *skb)
3322 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
/* dst output handler counterpart of ip6_pkt_prohibit(). */
3325 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3327 skb->dev = skb_dst(skb)->dev;
3328 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3332 * Allocate a dst for local (unicast / anycast) address.
/* Build a /128 local or anycast route for @addr on @idev's device.
 * Returns the new rt6_info or ERR_PTR(-ENOMEM).
 * NOTE(review): the 'anycast' parameter line and the return statement are
 * elided in this view.
 */
3335 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
3336 const struct in6_addr *addr,
3340 struct net *net = dev_net(idev->dev);
3341 struct net_device *dev = idev->dev;
3342 struct rt6_info *rt;
/* DST_NOCOUNT: local routes do not count against the route GC limit. */
3344 rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
3346 return ERR_PTR(-ENOMEM);
3350 rt->dst.flags |= DST_HOST;
3351 rt->dst.input = ip6_input;
3352 rt->dst.output = ip6_output;
3353 rt->rt6i_idev = idev;
3355 rt->rt6i_protocol = RTPROT_KERNEL;
3356 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
3358 rt->rt6i_flags |= RTF_ANYCAST;
3360 rt->rt6i_flags |= RTF_LOCAL;
/* For local routes the gateway field holds the address itself. */
3362 rt->rt6i_gateway = *addr;
3363 rt->rt6i_dst.addr = *addr;
3364 rt->rt6i_dst.plen = 128;
3365 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3366 rt->rt6i_table = fib6_get_table(net, tb_id);
3371 /* remove deleted ip from prefsrc entries */
/* Argument bundle for fib6_remove_prefsrc(): device being cleaned
 * (NULL means all devices), its netns, and the removed address.
 * NOTE(review): the 'struct net *net' member line is elided in this view.
 */
3372 struct arg_dev_net_ip {
3373 struct net_device *dev;
3375 struct in6_addr *addr;
/* fib6_clean_all() callback: clear the preferred-source address on any
 * route that references the removed address @addr (optionally restricted
 * to @dev), and propagate the change to cached exception routes.
 */
3378 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
3380 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3381 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3382 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
/* dev == NULL matches routes on any device. */
3384 if (((void *)rt->dst.dev == dev || !dev) &&
3385 rt != net->ipv6.ip6_null_entry &&
3386 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
3387 spin_lock_bh(&rt6_exception_lock);
3388 /* remove prefsrc entry */
3389 rt->rt6i_prefsrc.plen = 0;
3390 /* need to update cache as well */
3391 rt6_exceptions_remove_prefsrc(rt);
3392 spin_unlock_bh(&rt6_exception_lock);
/* Walk all fib6 tables and clear preferred-source entries referring to
 * the interface address @ifp being removed.
 * NOTE(review): the .net/.addr initializers of adni are elided in this view.
 */
3397 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3399 struct net *net = dev_net(ifp->idev->dev);
3400 struct arg_dev_net_ip adni = {
3401 .dev = ifp->idev->dev,
3405 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
/* Flag combination identifying an RA-learned router route. */
3408 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3410 /* Remove routers and update dst entries when gateway turn into host. */
/* fib6_clean_all() callback: delete RA router routes whose gateway is
 * @gateway (the node that stopped being a router), and scrub matching
 * cached exception routes.
 * NOTE(review): the deletion return value on match is elided in this view.
 */
3411 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
3413 struct in6_addr *gateway = (struct in6_addr *)arg;
3415 if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3416 ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
3420 /* Further clean up cached routes in exception table.
3421 * This is needed because cached route may have a different
3422 * gateway than its 'parent' in the case of an ip redirect.
3424 rt6_exceptions_clean_tohost(rt, gateway);
/* Entry point: purge routes through @gateway across all tables when that
 * node ceases to act as a router.
 */
3429 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3431 fib6_clean_all(net, fib6_clean_tohost, gateway);
/* Argument bundle for fib6_ifdown(): the device going down and its netns.
 * NOTE(review): the 'struct net *net' member line is elided in this view.
 */
3434 struct arg_dev_net {
3435 struct net_device *dev;
3439 /* called with write lock held for table with rt */
/* fib6_clean_all() callback on interface down: select routes on @dev
 * (or all routes when dev is NULL) for removal.  Multipath siblings are
 * kept alive when ignore_routes_with_linkdown allows dead nexthops,
 * unless the device is actually being unregistered.
 */
3440 static int fib6_ifdown(struct rt6_info *rt, void *arg)
3442 const struct arg_dev_net *adn = arg;
3443 const struct net_device *dev = adn->dev;
3445 if ((rt->dst.dev == dev || !dev) &&
3446 rt != adn->net->ipv6.ip6_null_entry &&
3447 (rt->rt6i_nsiblings == 0 ||
3448 (dev && netdev_unregistering(dev)) ||
3449 !rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
/* Remove routes using @dev from all fib6 tables and flush matching
 * entries from the uncached (DST_NOCACHE) list.
 * NOTE(review): the .net initializer of adn is elided in this view.
 */
3455 void rt6_ifdown(struct net *net, struct net_device *dev)
3457 struct arg_dev_net adn = {
3462 fib6_clean_all(net, fib6_ifdown, &adn);
3464 rt6_uncached_list_flush_dev(net, dev);
/* Argument bundle for rt6_mtu_change_route(): device whose MTU changed.
 * NOTE(review): the 'unsigned int mtu' member line is elided in this view.
 */
3467 struct rt6_mtu_change_arg {
3468 struct net_device *dev;
/* fib6_clean_all() callback for a device MTU change: update RTAX_MTU on
 * routes over @arg->dev (unless the metric is locked) and propagate the
 * new PMTU to cached exception routes.  See RFC 1981 rationale below.
 */
3472 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
3474 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
3475 struct inet6_dev *idev;
3477 /* In IPv6 pmtu discovery is not optional,
3478 so that RTAX_MTU lock cannot disable it.
3479 We still use this lock to block changes
3480 caused by addrconf/ndisc.
3483 idev = __in6_dev_get(arg->dev);
3487 /* For administrative MTU increase, there is no way to discover
3488 IPv6 PMTU increase, so PMTU increase should be updated here.
3489 Since RFC 1981 doesn't include administrative MTU increase
3490 update PMTU increase is a MUST. (i.e. jumbo frame)
3493 If new MTU is less than route PMTU, this new MTU will be the
3494 lowest MTU in the path, update the route PMTU to reflect PMTU
3495 decreases; if new MTU is greater than route PMTU, and the
3496 old MTU is the lowest MTU in the path, update the route PMTU
3497 to reflect the increase. In this case if the other nodes' MTU
3498 also have the lowest MTU, TOO BIG MESSAGE will be lead to
3501 if (rt->dst.dev == arg->dev &&
3502 dst_metric_raw(&rt->dst, RTAX_MTU) &&
3503 !dst_metric_locked(&rt->dst, RTAX_MTU)) {
/* Exception-table updates are serialized by rt6_exception_lock. */
3504 spin_lock_bh(&rt6_exception_lock);
3505 if (dst_mtu(&rt->dst) >= arg->mtu ||
3506 (dst_mtu(&rt->dst) < arg->mtu &&
3507 dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
3508 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
3510 rt6_exceptions_update_pmtu(rt, arg->mtu);
3511 spin_unlock_bh(&rt6_exception_lock);
/* Propagate a device MTU change to all routes over @dev.
 * NOTE(review): the .dev/.mtu initializers of arg are elided in this view.
 */
3516 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
3518 struct rt6_mtu_change_arg arg = {
3523 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
/* Netlink attribute validation policy for RTM_*ROUTE requests.
 * Address attributes (RTA_DST/RTA_SRC/RTA_PREFSRC) are validated by
 * length in rtm_to_fib6_config() rather than here.
 */
3526 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
3527 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
3528 [RTA_OIF] = { .type = NLA_U32 },
3529 [RTA_IIF] = { .type = NLA_U32 },
3530 [RTA_PRIORITY] = { .type = NLA_U32 },
3531 [RTA_METRICS] = { .type = NLA_NESTED },
3532 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
3533 [RTA_PREF] = { .type = NLA_U8 },
3534 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
3535 [RTA_ENCAP] = { .type = NLA_NESTED },
3536 [RTA_EXPIRES] = { .type = NLA_U32 },
3537 [RTA_UID] = { .type = NLA_U32 },
3538 [RTA_MARK] = { .type = NLA_U32 },
/* Parse an RTM_NEWROUTE/RTM_DELROUTE netlink message into @cfg.
 * Validates attributes against rtm_ipv6_policy, maps rtm_type to route
 * flags (reject types, local, cloned), and copies addresses, metrics,
 * multipath nexthops, encap and expiry information.
 * NOTE(review): several error-exit and closing lines are elided in this
 * view; 'pref' and other locals are declared on elided lines.
 */
3541 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
3542 struct fib6_config *cfg,
3543 struct netlink_ext_ack *extack)
3546 struct nlattr *tb[RTA_MAX+1];
3550 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
3556 rtm = nlmsg_data(nlh);
3557 memset(cfg, 0, sizeof(*cfg));
3559 cfg->fc_table = rtm->rtm_table;
3560 cfg->fc_dst_len = rtm->rtm_dst_len;
3561 cfg->fc_src_len = rtm->rtm_src_len;
3562 cfg->fc_flags = RTF_UP;
3563 cfg->fc_protocol = rtm->rtm_protocol;
3564 cfg->fc_type = rtm->rtm_type;
/* All reject-style route types share the RTF_REJECT flag. */
3566 if (rtm->rtm_type == RTN_UNREACHABLE ||
3567 rtm->rtm_type == RTN_BLACKHOLE ||
3568 rtm->rtm_type == RTN_PROHIBIT ||
3569 rtm->rtm_type == RTN_THROW)
3570 cfg->fc_flags |= RTF_REJECT;
3572 if (rtm->rtm_type == RTN_LOCAL)
3573 cfg->fc_flags |= RTF_LOCAL;
3575 if (rtm->rtm_flags & RTM_F_CLONED)
3576 cfg->fc_flags |= RTF_CACHE;
3578 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
3579 cfg->fc_nlinfo.nlh = nlh;
3580 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
3582 if (tb[RTA_GATEWAY]) {
3583 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
3584 cfg->fc_flags |= RTF_GATEWAY;
/* Addresses are copied only up to the prefix length in bytes. */
3588 int plen = (rtm->rtm_dst_len + 7) >> 3;
3590 if (nla_len(tb[RTA_DST]) < plen)
3593 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
3597 int plen = (rtm->rtm_src_len + 7) >> 3;
3599 if (nla_len(tb[RTA_SRC]) < plen)
3602 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
3605 if (tb[RTA_PREFSRC])
3606 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
3609 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
3611 if (tb[RTA_PRIORITY])
3612 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
3614 if (tb[RTA_METRICS]) {
3615 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
3616 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
3620 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
3622 if (tb[RTA_MULTIPATH]) {
3623 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
3624 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
/* Validate encap types embedded inside the multipath nexthops. */
3626 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
3627 cfg->fc_mp_len, extack);
/* Out-of-range preferences fall back to medium per RFC 4191. */
3633 pref = nla_get_u8(tb[RTA_PREF]);
3634 if (pref != ICMPV6_ROUTER_PREF_LOW &&
3635 pref != ICMPV6_ROUTER_PREF_HIGH)
3636 pref = ICMPV6_ROUTER_PREF_MEDIUM;
3637 cfg->fc_flags |= RTF_PREF(pref);
3641 cfg->fc_encap = tb[RTA_ENCAP];
3643 if (tb[RTA_ENCAP_TYPE]) {
3644 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
3646 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
3651 if (tb[RTA_EXPIRES]) {
3652 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
3654 if (addrconf_finite_timeout(timeout)) {
3655 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
3656 cfg->fc_flags |= RTF_EXPIRES;
/* One parsed nexthop in a multipath add: the built route, its per-hop
 * config, converted metrics, and the list linkage.
 * NOTE(review): the 'struct rt6_nh {' opening line is elided in this view.
 */
3666 struct rt6_info *rt6_info;
3667 struct fib6_config r_cfg;
3668 struct mx6_config mxc;
3669 struct list_head next;
/* Log every nexthop of a failed multipath replace so the administrator
 * can reconcile the now possibly-inconsistent installed state.
 */
3672 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
3676 list_for_each_entry(nh, rt6_nh_list, next) {
3677 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
3678 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
3679 nh->r_cfg.fc_ifindex);
/* Append @rt (with its per-hop config @r_cfg) to @rt6_nh_list, skipping
 * duplicates detected by rt6_duplicate_nexthop().  Converts netlink
 * metrics into nh->mxc.
 * NOTE(review): the duplicate-found return and error cleanup lines are
 * elided in this view.
 */
3683 static int ip6_route_info_append(struct list_head *rt6_nh_list,
3684 struct rt6_info *rt, struct fib6_config *r_cfg)
3689 list_for_each_entry(nh, rt6_nh_list, next) {
3690 /* check if rt6_info already exists */
3691 if (rt6_duplicate_nexthop(nh->rt6_info, rt))
3695 nh = kzalloc(sizeof(*nh), GFP_KERNEL);
3699 err = ip6_convert_metrics(&nh->mxc, r_cfg);
3704 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
3705 list_add_tail(&nh->next, rt6_nh_list);
/* Send a single RTM_NEWROUTE notification for a multipath add/append.
 * NOTE(review): the 'nlflags' parameter line and the list_first_entry
 * member arguments are elided in this view.
 */
3710 static void ip6_route_mpath_notify(struct rt6_info *rt,
3711 struct rt6_info *rt_last,
3712 struct nl_info *info,
3715 /* if this is an APPEND route, then rt points to the first route
3716 * inserted and rt_last points to last route inserted. Userspace
3717 * wants a consistent dump of the route which starts at the first
3718 * nexthop. Since sibling routes are always added at the end of
3719 * the list, find the first sibling of the last route appended
3721 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
3722 rt = list_first_entry(&rt_last->rt6i_siblings,
3728 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
/* Add a multipath route: parse each rtnexthop in cfg->fc_mp into a
 * rt6_info, insert them one by one with per-route notifications
 * suppressed, then send one combined notification.  On insertion failure
 * the already-added routes are deleted again and the failure is reported.
 * NOTE(review): declarations of nlflags/remaining/attrlen/err, several
 * branch/exit lines and the cleanup labels are elided in this view.
 */
3731 static int ip6_route_multipath_add(struct fib6_config *cfg,
3732 struct netlink_ext_ack *extack)
3734 struct rt6_info *rt_notif = NULL, *rt_last = NULL;
3735 struct nl_info *info = &cfg->fc_nlinfo;
3736 struct fib6_config r_cfg;
3737 struct rtnexthop *rtnh;
3738 struct rt6_info *rt;
3739 struct rt6_nh *err_nh;
3740 struct rt6_nh *nh, *nh_safe;
3746 int replace = (cfg->fc_nlinfo.nlh &&
3747 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
3748 LIST_HEAD(rt6_nh_list);
3750 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
3751 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
3752 nlflags |= NLM_F_APPEND;
3754 remaining = cfg->fc_mp_len;
3755 rtnh = (struct rtnexthop *)cfg->fc_mp;
3757 /* Parse a Multipath Entry and build a list (rt6_nh_list) of
3758 * rt6_info structs per nexthop
3760 while (rtnh_ok(rtnh, remaining)) {
/* Start from the shared config, then overlay per-hop attributes. */
3761 memcpy(&r_cfg, cfg, sizeof(*cfg));
3762 if (rtnh->rtnh_ifindex)
3763 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3765 attrlen = rtnh_attrlen(rtnh);
3767 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3769 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3771 r_cfg.fc_gateway = nla_get_in6_addr(nla);
3772 r_cfg.fc_flags |= RTF_GATEWAY;
3774 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
3775 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
3777 r_cfg.fc_encap_type = nla_get_u16(nla);
3780 rt = ip6_route_info_create(&r_cfg, extack);
3787 err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
3789 dst_release_immediate(&rt->dst);
3793 rtnh = rtnh_next(rtnh, &remaining);
3796 /* for add and replace send one notification with all nexthops.
3797 * Skip the notification in fib6_add_rt2node and send one with
3798 * the full route when done
3800 info->skip_notify = 1;
3803 list_for_each_entry(nh, &rt6_nh_list, next) {
3804 rt_last = nh->rt6_info;
3805 err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
3806 /* save reference to first route for notification */
3807 if (!rt_notif && !err)
3808 rt_notif = nh->rt6_info;
3810 /* nh->rt6_info is used or freed at this point, reset to NULL*/
3811 nh->rt6_info = NULL;
3814 ip6_print_replace_route_err(&rt6_nh_list);
3819 /* Because each route is added like a single route we remove
3820 * these flags after the first nexthop: if there is a collision,
3821 * we have already failed to add the first nexthop:
3822 * fib6_add_rt2node() has rejected it; when replacing, old
3823 * nexthops have been replaced by first new, the rest should
3826 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
3831 /* success ... tell user about new route */
3832 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3836 /* send notification for routes that were added so that
3837 * the delete notifications sent by ip6_route_del are
3841 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3843 /* Delete routes that were already added */
3844 list_for_each_entry(nh, &rt6_nh_list, next) {
3847 ip6_route_del(&nh->r_cfg, extack);
/* Free any nexthops never handed to the fib (rt6_info still set). */
3851 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
3853 dst_release_immediate(&nh->rt6_info->dst);
3855 list_del(&nh->next);
/* Delete a multipath route one nexthop at a time, remembering the last
 * failure so a single error is reported while the remaining hops are
 * still attempted.
 * NOTE(review): the last_err bookkeeping lines and the return are elided
 * in this view.
 */
3862 static int ip6_route_multipath_del(struct fib6_config *cfg,
3863 struct netlink_ext_ack *extack)
3865 struct fib6_config r_cfg;
3866 struct rtnexthop *rtnh;
3869 int err = 1, last_err = 0;
3871 remaining = cfg->fc_mp_len;
3872 rtnh = (struct rtnexthop *)cfg->fc_mp;
3874 /* Parse a Multipath Entry */
3875 while (rtnh_ok(rtnh, remaining)) {
3876 memcpy(&r_cfg, cfg, sizeof(*cfg));
3877 if (rtnh->rtnh_ifindex)
3878 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3880 attrlen = rtnh_attrlen(rtnh);
3882 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3884 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
/* 16 bytes = sizeof(struct in6_addr). */
3886 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
3887 r_cfg.fc_flags |= RTF_GATEWAY;
3890 err = ip6_route_del(&r_cfg, extack);
3894 rtnh = rtnh_next(rtnh, &remaining);
/* RTM_DELROUTE handler: parse the request, then dispatch to multipath
 * delete when RTA_MULTIPATH is present, otherwise delete the route with
 * all of its nexthops (fc_delete_all_nh).
 */
3900 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3901 struct netlink_ext_ack *extack)
3903 struct fib6_config cfg;
3906 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3911 return ip6_route_multipath_del(&cfg, extack);
3913 cfg.fc_delete_all_nh = 1;
3914 return ip6_route_del(&cfg, extack);
/* RTM_NEWROUTE handler: parse the request, then dispatch to multipath or
 * single-route add depending on the presence of RTA_MULTIPATH.
 */
3918 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3919 struct netlink_ext_ack *extack)
3921 struct fib6_config cfg;
3924 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3929 return ip6_route_multipath_add(&cfg, extack);
3931 return ip6_route_add(&cfg, extack);
/* Worst-case netlink message size for dumping @rt, including one
 * RTA_MULTIPATH nexthop entry per sibling.  Must stay in sync with
 * rt6_fill_node() (see the -EMSGSIZE WARN in inet6_rt_notify()).
 */
3934 static size_t rt6_nlmsg_size(struct rt6_info *rt)
3936 int nexthop_len = 0;
3938 if (rt->rt6i_nsiblings) {
3939 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
3940 + NLA_ALIGN(sizeof(struct rtnexthop))
3941 + nla_total_size(16) /* RTA_GATEWAY */
3942 + lwtunnel_get_encap_size(rt->dst.lwtstate);
3944 nexthop_len *= rt->rt6i_nsiblings;
3947 return NLMSG_ALIGN(sizeof(struct rtmsg))
3948 + nla_total_size(16) /* RTA_SRC */
3949 + nla_total_size(16) /* RTA_DST */
3950 + nla_total_size(16) /* RTA_GATEWAY */
3951 + nla_total_size(16) /* RTA_PREFSRC */
3952 + nla_total_size(4) /* RTA_TABLE */
3953 + nla_total_size(4) /* RTA_IIF */
3954 + nla_total_size(4) /* RTA_OIF */
3955 + nla_total_size(4) /* RTA_PRIORITY */
3956 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
3957 + nla_total_size(sizeof(struct rta_cacheinfo))
3958 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
3959 + nla_total_size(1) /* RTA_PREF */
3960 + lwtunnel_get_encap_size(rt->dst.lwtstate)
/* Emit the nexthop attributes for @rt into @skb: link state flags,
 * gateway, output interface (skipped for multipath entries, which carry
 * the ifindex in struct rtnexthop) and lwtunnel encap.
 * NOTE(review): the success return and nla_put_failure label are elided
 * in this view.
 */
3964 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
3965 unsigned int *flags, bool skip_oif)
3967 if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) {
3968 *flags |= RTNH_F_LINKDOWN;
3969 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
3970 *flags |= RTNH_F_DEAD;
3973 if (rt->rt6i_flags & RTF_GATEWAY) {
3974 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
3975 goto nla_put_failure;
3978 if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
3979 *flags |= RTNH_F_OFFLOAD;
3981 /* not needed for multipath encoding b/c it has a rtnexthop struct */
3982 if (!skip_oif && rt->dst.dev &&
3983 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
3984 goto nla_put_failure;
3986 if (rt->dst.lwtstate &&
3987 lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
3988 goto nla_put_failure;
3996 /* add multipath next hop */
/* Append one struct rtnexthop plus its attributes for @rt to @skb as
 * part of an RTA_MULTIPATH nest; rtnh_len is fixed up after the
 * attributes are written.
 * NOTE(review): the success return and nla_put_failure label are elided
 * in this view.
 */
3997 static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
3999 struct rtnexthop *rtnh;
4000 unsigned int flags = 0;
4002 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4004 goto nla_put_failure;
4006 rtnh->rtnh_hops = 0;
4007 rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
/* skip_oif=true: the ifindex above replaces RTA_OIF. */
4009 if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4010 goto nla_put_failure;
4012 rtnh->rtnh_flags = flags;
4014 /* length of rtnetlink header + attributes */
4015 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
/* Serialize @rt into a netlink RTM message on @skb: rtmsg header, table,
 * type derived from flags/dst.error, addresses, metrics, multipath
 * siblings, cacheinfo and router preference.  @dst/@src, when non-NULL,
 * describe the specific lookup being answered (RTM_GETROUTE).
 * Size must stay within rt6_nlmsg_size().
 * NOTE(review): numerous branch/exit lines, the flags parameter line and
 * some local declarations are elided in this view.
 */
4023 static int rt6_fill_node(struct net *net,
4024 struct sk_buff *skb, struct rt6_info *rt,
4025 struct in6_addr *dst, struct in6_addr *src,
4026 int iif, int type, u32 portid, u32 seq,
4029 u32 metrics[RTAX_MAX];
4031 struct nlmsghdr *nlh;
4035 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4039 rtm = nlmsg_data(nlh);
4040 rtm->rtm_family = AF_INET6;
4041 rtm->rtm_dst_len = rt->rt6i_dst.plen;
4042 rtm->rtm_src_len = rt->rt6i_src.plen;
4045 table = rt->rt6i_table->tb6_id;
4047 table = RT6_TABLE_UNSPEC;
4048 rtm->rtm_table = table;
4049 if (nla_put_u32(skb, RTA_TABLE, table))
4050 goto nla_put_failure;
/* Map reject routes back to the RTN_* type via their dst.error. */
4051 if (rt->rt6i_flags & RTF_REJECT) {
4052 switch (rt->dst.error) {
4054 rtm->rtm_type = RTN_BLACKHOLE;
4057 rtm->rtm_type = RTN_PROHIBIT;
4060 rtm->rtm_type = RTN_THROW;
4063 rtm->rtm_type = RTN_UNREACHABLE;
4067 else if (rt->rt6i_flags & RTF_LOCAL)
4068 rtm->rtm_type = RTN_LOCAL;
4069 else if (rt->rt6i_flags & RTF_ANYCAST)
4070 rtm->rtm_type = RTN_ANYCAST;
4071 else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
4072 rtm->rtm_type = RTN_LOCAL;
4074 rtm->rtm_type = RTN_UNICAST;
4076 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4077 rtm->rtm_protocol = rt->rt6i_protocol;
4079 if (rt->rt6i_flags & RTF_CACHE)
4080 rtm->rtm_flags |= RTM_F_CLONED;
/* When answering a specific lookup, report the queried /128. */
4083 if (nla_put_in6_addr(skb, RTA_DST, dst))
4084 goto nla_put_failure;
4085 rtm->rtm_dst_len = 128;
4086 } else if (rtm->rtm_dst_len)
4087 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
4088 goto nla_put_failure;
4089 #ifdef CONFIG_IPV6_SUBTREES
4091 if (nla_put_in6_addr(skb, RTA_SRC, src))
4092 goto nla_put_failure;
4093 rtm->rtm_src_len = 128;
4094 } else if (rtm->rtm_src_len &&
4095 nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
4096 goto nla_put_failure;
4099 #ifdef CONFIG_IPV6_MROUTE
4100 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
4101 int err = ip6mr_get_route(net, skb, rtm, portid);
4106 goto nla_put_failure;
4109 if (nla_put_u32(skb, RTA_IIF, iif))
4110 goto nla_put_failure;
4112 struct in6_addr saddr_buf;
4113 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
4114 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4115 goto nla_put_failure;
4118 if (rt->rt6i_prefsrc.plen) {
4119 struct in6_addr saddr_buf;
4120 saddr_buf = rt->rt6i_prefsrc.addr;
4121 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4122 goto nla_put_failure;
4125 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
/* A discovered PMTU overrides the configured MTU metric. */
4127 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
4128 if (rtnetlink_put_metrics(skb, metrics) < 0)
4129 goto nla_put_failure;
4131 if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
4132 goto nla_put_failure;
4134 /* For multipath routes, walk the siblings list and add
4135 * each as a nexthop within RTA_MULTIPATH.
4137 if (rt->rt6i_nsiblings) {
4138 struct rt6_info *sibling, *next_sibling;
4141 mp = nla_nest_start(skb, RTA_MULTIPATH);
4143 goto nla_put_failure;
4145 if (rt6_add_nexthop(skb, rt) < 0)
4146 goto nla_put_failure;
4148 list_for_each_entry_safe(sibling, next_sibling,
4149 &rt->rt6i_siblings, rt6i_siblings) {
4150 if (rt6_add_nexthop(skb, sibling) < 0)
4151 goto nla_put_failure;
4154 nla_nest_end(skb, mp);
4156 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4157 goto nla_put_failure;
4160 expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
4162 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
4163 goto nla_put_failure;
4165 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
4166 goto nla_put_failure;
4169 nlmsg_end(skb, nlh);
4173 nlmsg_cancel(skb, nlh);
/* fib6 walker callback for RTM_GETROUTE dumps: skip the null entry,
 * honour the RTM_F_PREFIX filter, and serialize the route with
 * rt6_fill_node() into the dump skb.
 */
4177 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
4179 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4180 struct net *net = arg->net;
4182 if (rt == net->ipv6.ip6_null_entry)
4185 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4186 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4188 /* user wants prefix routes only */
4189 if (rtm->rtm_flags & RTM_F_PREFIX &&
4190 !(rt->rt6i_flags & RTF_PREFIX_RT)) {
4191 /* success since this is not a prefix route */
4196 return rt6_fill_node(net,
4197 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
4198 NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
/* RTM_GETROUTE handler: build a flow from the request attributes,
 * perform an input- or output-side route lookup (RTM_F_FIB_MATCH returns
 * the FIB entry instead of the cached result), and reply with a single
 * RTM_NEWROUTE message.
 * NOTE(review): declarations of rtm/fl6/fibmatch/flags, error-exit lines
 * and the rcu_read_lock around dev lookup are elided in this view.
 */
4202 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4203 struct netlink_ext_ack *extack)
4205 struct net *net = sock_net(in_skb->sk);
4206 struct nlattr *tb[RTA_MAX+1];
4207 int err, iif = 0, oif = 0;
4208 struct dst_entry *dst;
4209 struct rt6_info *rt;
4210 struct sk_buff *skb;
4215 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4221 memset(&fl6, 0, sizeof(fl6));
4222 rtm = nlmsg_data(nlh);
4223 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4224 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4227 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4230 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4234 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4237 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4241 iif = nla_get_u32(tb[RTA_IIF]);
4244 oif = nla_get_u32(tb[RTA_OIF]);
4247 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4250 fl6.flowi6_uid = make_kuid(current_user_ns(),
4251 nla_get_u32(tb[RTA_UID]));
4253 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
/* iif given: emulate packet reception on that interface. */
4256 struct net_device *dev;
4261 dev = dev_get_by_index_rcu(net, iif);
4268 fl6.flowi6_iif = iif;
4270 if (!ipv6_addr_any(&fl6.saddr))
4271 flags |= RT6_LOOKUP_F_HAS_SADDR;
4274 dst = ip6_route_input_lookup(net, dev, &fl6, flags);
4276 dst = ip6_route_lookup(net, &fl6, 0);
/* No iif: treat as locally-originated output lookup. */
4280 fl6.flowi6_oif = oif;
4283 dst = ip6_route_output(net, NULL, &fl6);
4285 dst = ip6_route_lookup(net, &fl6, 0);
4289 rt = container_of(dst, struct rt6_info, dst);
4290 if (rt->dst.error) {
4291 err = rt->dst.error;
4296 if (rt == net->ipv6.ip6_null_entry) {
4297 err = rt->dst.error;
4302 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
/* skb owns the dst reference from here on. */
4309 skb_dst_set(skb, &rt->dst);
4311 err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
4312 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4315 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
4316 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4323 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
/*
 * inet6_rt_notify - broadcast a route change to RTNLGRP_IPV6_ROUTE
 * listeners.
 *
 * Builds an RTM_* message for @rt sized by rt6_nlmsg_size() and sends
 * it via rtnl_notify(); on failure the group error state is set with
 * rtnl_set_sk_err(). gfp_any() picks the allocation mode based on
 * context (may be called from softirq).
 *
 * NOTE(review): error-path/goto lines are missing from this excerpt.
 */
4328 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
4329 unsigned int nlm_flags)
4331 struct sk_buff *skb;
4332 struct net *net = info->nl_net;
/* Echo the request's sequence number when triggered by a request. */
4337 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4339 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4343 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
4344 event, info->portid, seq, nlm_flags);
4346 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4347 WARN_ON(err == -EMSGSIZE);
4351 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4352 info->nlh, gfp_any());
4356 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
/*
 * ip6_route_dev_notify - netdevice notifier for the loopback device.
 *
 * Binds the per-netns special routes (null entry, and under
 * CONFIG_IPV6_MULTIPLE_TABLES the prohibit and blackhole entries) to
 * the netns loopback device on NETDEV_REGISTER, and drops their idev
 * references on NETDEV_UNREGISTER.
 */
4359 static int ip6_route_dev_notify(struct notifier_block *this,
4360 unsigned long event, void *ptr)
4362 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4363 struct net *net = dev_net(dev);
/* Only the loopback device anchors the special entries. */
4365 if (!(dev->flags & IFF_LOOPBACK))
4368 if (event == NETDEV_REGISTER) {
4369 net->ipv6.ip6_null_entry->dst.dev = dev;
4370 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
4371 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4372 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
4373 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
4374 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
4375 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
/* reg_state check guards against repeated UNREGISTER events (see
 * comment below); only release the idev refs once.
 */
4377 } else if (event == NETDEV_UNREGISTER &&
4378 dev->reg_state != NETREG_UNREGISTERED) {
4379 /* NETDEV_UNREGISTER could be fired for multiple times by
4380 * netdev_wait_allrefs(). Make sure we only call this once.
4382 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
4383 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4384 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
4385 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
4396 #ifdef CONFIG_PROC_FS
/* file_operations for /proc/net/ipv6_route (seq_file based; the .read
 * and other missing members are outside this excerpt).
 */
4398 static const struct file_operations ipv6_route_proc_fops = {
4399 .owner = THIS_MODULE,
4400 .open = ipv6_route_open,
4402 .llseek = seq_lseek,
4403 .release = seq_release_net,
/*
 * rt6_stats_seq_show - emit one line of IPv6 routing statistics for
 * /proc/net/rt6_stats: seven hex fields covering fib node/route counts,
 * cache entries, live dst entries and discarded routes.
 */
4406 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
4408 struct net *net = (struct net *)seq->private;
4409 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
4410 net->ipv6.rt6_stats->fib_nodes,
4411 net->ipv6.rt6_stats->fib_route_nodes,
4412 net->ipv6.rt6_stats->fib_rt_alloc,
4413 net->ipv6.rt6_stats->fib_rt_entries,
4414 net->ipv6.rt6_stats->fib_rt_cache,
/* dst_entries_get_slow reads the exact (not per-cpu cached) count. */
4415 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
4416 net->ipv6.rt6_stats->fib_discarded_routes);
/* open() for /proc/net/rt6_stats: single-shot netns-aware seq file. */
4421 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
4423 return single_open_net(inode, file, rt6_stats_seq_show);
/* file_operations for /proc/net/rt6_stats (single_open_net based). */
4426 static const struct file_operations rt6_stats_seq_fops = {
4427 .owner = THIS_MODULE,
4428 .open = rt6_stats_seq_open,
4430 .llseek = seq_lseek,
4431 .release = single_release_net,
4433 #endif /* CONFIG_PROC_FS */
4435 #ifdef CONFIG_SYSCTL
/*
 * ipv6_sysctl_rtcache_flush - handler for net.ipv6.route.flush.
 *
 * On write, reads the requested delay and triggers a fib6 garbage
 * collection: delay <= 0 means flush immediately, delay > 0 schedules
 * GC after that many jiffies.
 *
 * NOTE(review): the write-only guard and return lines are missing from
 * this excerpt.
 */
4438 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4439 void __user *buffer, size_t *lenp, loff_t *ppos)
/* extra1 was set to the owning netns in ipv6_route_sysctl_init(). */
4446 net = (struct net *)ctl->extra1;
4447 delay = net->ipv6.sysctl.flush_delay;
4448 proc_dointvec(ctl, write, buffer, lenp, ppos);
4449 fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
/*
 * Template for the per-netns net.ipv6.route.* sysctl table.  The .data
 * pointers reference init_net here and are re-pointed to the cloned
 * netns in ipv6_route_sysctl_init() by index, so the entry ORDER below
 * must stay in sync with the table[N].data assignments there.
 */
4453 struct ctl_table ipv6_route_table_template[] = {
/* flush: write-only trigger handled by ipv6_sysctl_rtcache_flush. */
4455 .procname = "flush",
4456 .data = &init_net.ipv6.sysctl.flush_delay,
4457 .maxlen = sizeof(int),
4459 .proc_handler = ipv6_sysctl_rtcache_flush
4462 .procname = "gc_thresh",
4463 .data = &ip6_dst_ops_template.gc_thresh,
4464 .maxlen = sizeof(int),
4466 .proc_handler = proc_dointvec,
4469 .procname = "max_size",
4470 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
4471 .maxlen = sizeof(int),
4473 .proc_handler = proc_dointvec,
/* Interval knobs below are stored in jiffies, hence the _jiffies
 * handlers; "gc_min_interval_ms" exposes the same variable in ms.
 */
4476 .procname = "gc_min_interval",
4477 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4478 .maxlen = sizeof(int),
4480 .proc_handler = proc_dointvec_jiffies,
4483 .procname = "gc_timeout",
4484 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
4485 .maxlen = sizeof(int),
4487 .proc_handler = proc_dointvec_jiffies,
4490 .procname = "gc_interval",
4491 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
4492 .maxlen = sizeof(int),
4494 .proc_handler = proc_dointvec_jiffies,
4497 .procname = "gc_elasticity",
4498 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
4499 .maxlen = sizeof(int),
4501 .proc_handler = proc_dointvec,
4504 .procname = "mtu_expires",
4505 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
4506 .maxlen = sizeof(int),
4508 .proc_handler = proc_dointvec_jiffies,
4511 .procname = "min_adv_mss",
4512 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
4513 .maxlen = sizeof(int),
4515 .proc_handler = proc_dointvec,
4518 .procname = "gc_min_interval_ms",
4519 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4520 .maxlen = sizeof(int),
4522 .proc_handler = proc_dointvec_ms_jiffies,
/*
 * ipv6_route_sysctl_init - clone the sysctl template for a new netns.
 *
 * kmemdup()s ipv6_route_table_template and rewrites each entry's .data
 * to the new netns's fields; indices must match the template order.
 * Returns the table (caller owns/frees it); NULL-check of kmemdup is
 * outside this excerpt.
 */
4527 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
4529 struct ctl_table *table;
4531 table = kmemdup(ipv6_route_table_template,
4532 sizeof(ipv6_route_table_template),
4536 table[0].data = &net->ipv6.sysctl.flush_delay;
/* extra1 carries the netns for ipv6_sysctl_rtcache_flush(). */
4537 table[0].extra1 = net;
4538 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
4539 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
4540 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4541 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
4542 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
4543 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
4544 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
4545 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
/* gc_min_interval_ms shares its variable with gc_min_interval. */
4546 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4548 /* Don't export sysctls to unprivileged users */
4549 if (net->user_ns != &init_user_ns)
4550 table[0].procname = NULL;
/*
 * ip6_route_net_init - per-netns setup for the IPv6 routing layer.
 *
 * Copies the dst_ops template, allocates the special route entries
 * (null; plus prohibit and blackhole when policy routing is enabled),
 * wires their dst.path/dst.ops/metrics, and seeds the per-netns route
 * sysctls with defaults.  Unwinds already-done allocations via the
 * goto labels at the bottom on failure.
 *
 * NOTE(review): several lines (GFP flags of the kmemdup calls, the
 * "ret = -ENOMEM" assignments, final return) are missing from this
 * excerpt.
 */
4557 static int __net_init ip6_route_net_init(struct net *net)
4561 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
4562 sizeof(net->ipv6.ip6_dst_ops));
4564 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
4565 goto out_ip6_dst_ops;
/* Null entry: returned for lookups that match nothing. */
4567 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
4568 sizeof(*net->ipv6.ip6_null_entry),
4570 if (!net->ipv6.ip6_null_entry)
4571 goto out_ip6_dst_entries;
4572 net->ipv6.ip6_null_entry->dst.path =
4573 (struct dst_entry *)net->ipv6.ip6_null_entry;
4574 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4575 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
4576 ip6_template_metrics, true);
4578 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4579 net->ipv6.fib6_has_custom_rules = false;
/* Prohibit entry: used by policy rules that reject traffic. */
4580 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
4581 sizeof(*net->ipv6.ip6_prohibit_entry),
4583 if (!net->ipv6.ip6_prohibit_entry)
4584 goto out_ip6_null_entry;
4585 net->ipv6.ip6_prohibit_entry->dst.path =
4586 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
4587 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4588 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
4589 ip6_template_metrics, true);
/* Blackhole entry: silently discards matching traffic. */
4591 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
4592 sizeof(*net->ipv6.ip6_blk_hole_entry),
4594 if (!net->ipv6.ip6_blk_hole_entry)
4595 goto out_ip6_prohibit_entry;
4596 net->ipv6.ip6_blk_hole_entry->dst.path =
4597 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
4598 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4599 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
4600 ip6_template_metrics, true);
/* Default sysctl values; times are in jiffies. */
4603 net->ipv6.sysctl.flush_delay = 0;
4604 net->ipv6.sysctl.ip6_rt_max_size = 4096;
4605 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
4606 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
4607 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
4608 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
4609 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
/* IPV6_MIN_MTU minus transport (20) and IPv6 (40) header overhead. */
4610 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
4612 net->ipv6.ip6_rt_gc_expire = 30*HZ;
/* Error unwind: free in reverse order of allocation. */
4618 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4619 out_ip6_prohibit_entry:
4620 kfree(net->ipv6.ip6_prohibit_entry);
4622 kfree(net->ipv6.ip6_null_entry);
4624 out_ip6_dst_entries:
4625 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/*
 * ip6_route_net_exit - per-netns teardown; frees the special route
 * entries allocated in ip6_route_net_init() and the dst counters.
 */
4630 static void __net_exit ip6_route_net_exit(struct net *net)
4632 kfree(net->ipv6.ip6_null_entry);
4633 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4634 kfree(net->ipv6.ip6_prohibit_entry);
4635 kfree(net->ipv6.ip6_blk_hole_entry);
4637 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/* Late per-netns init: register the two /proc/net entries. */
4640 static int __net_init ip6_route_net_init_late(struct net *net)
4642 #ifdef CONFIG_PROC_FS
4643 proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops)
4644 proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
/* Late per-netns exit: remove the /proc/net entries created above. */
4649 static void __net_exit ip6_route_net_exit_late(struct net *net)
4651 #ifdef CONFIG_PROC_FS
4652 remove_proc_entry("ipv6_route", net->proc_net);
4653 remove_proc_entry("rt6_stats", net->proc_net);
/* Main per-netns operations for the IPv6 routing layer. */
4657 static struct pernet_operations ip6_route_net_ops = {
4658 .init = ip6_route_net_init,
4659 .exit = ip6_route_net_exit,
/*
 * ipv6_inetpeer_init - allocate and initialize the per-netns inetpeer
 * base (NULL-check/return lines are outside this excerpt).
 */
4662 static int __net_init ipv6_inetpeer_init(struct net *net)
4664 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
4668 inet_peer_base_init(bp);
4669 net->ipv6.peers = bp;
/*
 * ipv6_inetpeer_exit - detach and invalidate the per-netns inetpeer
 * tree; clearing the pointer first prevents further use.
 */
4673 static void __net_exit ipv6_inetpeer_exit(struct net *net)
4675 struct inet_peer_base *bp = net->ipv6.peers;
4677 net->ipv6.peers = NULL;
4678 inetpeer_invalidate_tree(bp);
/* Per-netns operations for the IPv6 inetpeer cache. */
4682 static struct pernet_operations ipv6_inetpeer_ops = {
4683 .init = ipv6_inetpeer_init,
4684 .exit = ipv6_inetpeer_exit,
/* Late per-netns operations (proc entries), registered after rules. */
4687 static struct pernet_operations ip6_route_net_late_ops = {
4688 .init = ip6_route_net_init_late,
4689 .exit = ip6_route_net_exit_late,
/* Netdevice notifier; priority keeps it ordered relative to addrconf's
 * notifier (runs around ADDRCONF_NOTIFY_PRIORITY).
 */
4692 static struct notifier_block ip6_route_dev_notifier = {
4693 .notifier_call = ip6_route_dev_notify,
4694 .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
/*
 * ip6_route_init_special_entries - bind init_net's special routes to
 * its loopback device.  For init_net the loopback registers before the
 * notifier path runs, so the dev/idev wiring normally done by
 * ip6_route_dev_notify() must be done here by hand.
 */
4697 void __init ip6_route_init_special_entries(void)
4699 /* Registering of the loopback is done before this portion of code,
4700 * the loopback reference in rt6_info will not be taken, do it
4701 * manually for init_net */
4702 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
4703 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4704 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4705 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
4706 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4707 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
4708 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
/*
 * ip6_route_init - subsystem bring-up at boot.
 *
 * Order: dst slab cache -> blackhole dst counters -> inetpeer pernet
 * ops -> main route pernet ops -> (fib6 init, not fully visible) ->
 * fib6 rules -> late pernet ops (proc) -> rtnetlink handlers ->
 * netdevice notifier -> per-cpu uncached-route lists.  Each failure
 * unwinds the previously registered pieces via the labels at the end.
 *
 * NOTE(review): several lines (ret checks, some intermediate init
 * calls and labels) are missing from this excerpt.
 */
4712 int __init ip6_route_init(void)
4718 ip6_dst_ops_template.kmem_cachep =
4719 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
4720 SLAB_HWCACHE_ALIGN, NULL);
4721 if (!ip6_dst_ops_template.kmem_cachep)
4724 ret = dst_entries_init(&ip6_dst_blackhole_ops);
4726 goto out_kmem_cache;
4728 ret = register_pernet_subsys(&ipv6_inetpeer_ops);
4730 goto out_dst_entries;
4732 ret = register_pernet_subsys(&ip6_route_net_ops);
4734 goto out_register_inetpeer;
/* Blackhole dsts share the same slab as regular rt6_info objects. */
4736 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
4740 goto out_register_subsys;
4746 ret = fib6_rules_init();
4750 ret = register_pernet_subsys(&ip6_route_net_late_ops);
4752 goto fib6_rules_init;
/* GETROUTE may run without the rtnl lock (DOIT_UNLOCKED). */
4755 if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, 0) ||
4756 __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, 0) ||
4757 __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL,
4758 RTNL_FLAG_DOIT_UNLOCKED))
4759 goto out_register_late_subsys;
4761 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
4763 goto out_register_late_subsys;
/* Per-cpu lists of routes not attached to a fib node. */
4765 for_each_possible_cpu(cpu) {
4766 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
4768 INIT_LIST_HEAD(&ul->head);
4769 spin_lock_init(&ul->lock);
/* Error unwind, reverse order of registration. */
4775 out_register_late_subsys:
4776 unregister_pernet_subsys(&ip6_route_net_late_ops);
4778 fib6_rules_cleanup();
4783 out_register_subsys:
4784 unregister_pernet_subsys(&ip6_route_net_ops);
4785 out_register_inetpeer:
4786 unregister_pernet_subsys(&ipv6_inetpeer_ops);
4788 dst_entries_destroy(&ip6_dst_blackhole_ops);
4790 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
/*
 * ip6_route_cleanup - subsystem teardown; mirrors ip6_route_init() in
 * reverse order.  NOTE(review): the function's tail (closing brace and
 * any remaining statements) lies beyond this excerpt.
 */
4794 void ip6_route_cleanup(void)
4796 unregister_netdevice_notifier(&ip6_route_dev_notifier);
4797 unregister_pernet_subsys(&ip6_route_net_late_ops);
4798 fib6_rules_cleanup();
4801 unregister_pernet_subsys(&ipv6_inetpeer_ops);
4802 unregister_pernet_subsys(&ip6_route_net_ops);
4803 dst_entries_destroy(&ip6_dst_blackhole_ops);
4804 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);