/*
 * Linux INET6 implementation
 * Pedro Roque <roque@di.fc.ul.pt>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * YOSHIFUJI Hideaki @USAGI
 * reworked default router selection.
 * - respect outgoing interface
 * - select from (probably) reachable routers (i.e.
 *   routers in REACHABLE, STALE, DELAY or PROBE states).
 * - always select the same router if it is (probably)
 *   reachable; otherwise, round-robin the list.
 * Fixed routing subtrees.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <linux/rtnetlink.h>
#include <net/dst_metadata.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <trace/events/fib6.h>
#include <linux/uaccess.h>

#include <linux/sysctl.h>

	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,

static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ip6_default_advmss(const struct dst_entry *dst);
static unsigned int ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void ip6_dst_destroy(struct dst_entry *);
static void ip6_dst_ifdown(struct dst_entry *,
			   struct net_device *dev, int how);
static int ip6_dst_gc(struct dst_ops *ops);

static int ip6_pkt_discard(struct sk_buff *skb);
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void ip6_link_failure(struct sk_buff *skb);
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu);
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
static void rt6_dst_from_metrics_check(struct rt6_info *rt);
static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct rt6_info *rt);
static int rt6_fill_node(struct net *net,
			 struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct rt6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev,
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev);

struct uncached_list {
	struct list_head head;

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

static void rt6_uncached_list_add(struct rt6_info *rt)
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);

static void rt6_uncached_list_del(struct rt6_info *rt)
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		spin_unlock_bh(&ul->lock);

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
	struct net_device *loopback_dev = net->loopback_dev;

	if (dev == loopback_dev)

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);

				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);

		spin_unlock_bh(&ul->lock);

static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
	return dst_metrics_write_ptr(rt->dst.from);

static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (rt->rt6i_flags & RTF_PCPU)
		return rt6_pcpu_cow_metrics(rt);
	else if (rt->rt6i_flags & RTF_CACHE)
	return dst_cow_metrics_generic(dst, old);

static inline const void *choose_neigh_daddr(struct rt6_info *rt,
	struct in6_addr *p = &rt->rt6i_gateway;

	if (!ipv6_addr_any(p))
		return (const void *) p;
		return &ipv6_hdr(skb)->daddr;

static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
	struct rt6_info *rt = (struct rt6_info *) dst;

	daddr = choose_neigh_daddr(rt, skb, daddr);
	n = __ipv6_neigh_lookup(dst->dev, daddr);
	return neigh_create(&nd_tbl, daddr, dst->dev);

static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(rt, NULL, daddr);
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
	__ipv6_confirm_neigh(dev, daddr);

static struct dst_ops ip6_dst_ops_template = {
	.check = ip6_dst_check,
	.default_advmss = ip6_default_advmss,
	.cow_metrics = ipv6_cow_metrics,
	.destroy = ip6_dst_destroy,
	.ifdown = ip6_dst_ifdown,
	.negative_advice = ip6_negative_advice,
	.link_failure = ip6_link_failure,
	.update_pmtu = ip6_rt_update_pmtu,
	.redirect = rt6_do_redirect,
	.local_out = __ip6_local_out,
	.neigh_lookup = ip6_neigh_lookup,
	.confirm_neigh = ip6_confirm_neigh,

static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,

static struct dst_ops ip6_dst_blackhole_ops = {
	.destroy = ip6_dst_destroy,
	.check = ip6_dst_check,
	.mtu = ip6_blackhole_mtu,
	.default_advmss = ip6_default_advmss,
	.update_pmtu = ip6_rt_blackhole_update_pmtu,
	.redirect = ip6_rt_blackhole_redirect,
	.cow_metrics = dst_cow_metrics_generic,
	.neigh_lookup = ip6_neigh_lookup,

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,

static const struct rt6_info ip6_null_entry_template = {
		.__refcnt = ATOMIC_INIT(1),
		.obsolete = DST_OBSOLETE_FORCE_CHK,
		.error = -ENETUNREACH,
		.input = ip6_pkt_discard,
		.output = ip6_pkt_discard_out,
	.rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol = RTPROT_KERNEL,
	.rt6i_metric = ~(u32) 0,
	.rt6i_ref = ATOMIC_INIT(1),

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
		.__refcnt = ATOMIC_INIT(1),
		.obsolete = DST_OBSOLETE_FORCE_CHK,
		.input = ip6_pkt_prohibit,
		.output = ip6_pkt_prohibit_out,
	.rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol = RTPROT_KERNEL,
	.rt6i_metric = ~(u32) 0,
	.rt6i_ref = ATOMIC_INIT(1),

static const struct rt6_info ip6_blk_hole_entry_template = {
		.__refcnt = ATOMIC_INIT(1),
		.obsolete = DST_OBSOLETE_FORCE_CHK,
		.input = dst_discard,
		.output = dst_discard_out,
	.rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol = RTPROT_KERNEL,
	.rt6i_metric = ~(u32) 0,
	.rt6i_ref = ATOMIC_INIT(1),

static void rt6_info_init(struct rt6_info *rt)
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_siblings);
	INIT_LIST_HEAD(&rt->rt6i_uncached);

/* allocate dst with ip6_dst_ops */
static struct rt6_info *__ip6_dst_alloc(struct net *net,
					struct net_device *dev,
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

struct rt6_info *ip6_dst_alloc(struct net *net,
			       struct net_device *dev,
	struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);

		rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);

			for_each_possible_cpu(cpu) {

				p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
				/* no one shares rt */

			dst_release_immediate(&rt->dst);

EXPORT_SYMBOL(ip6_dst_alloc);

static void ip6_dst_destroy(struct dst_entry *dst)
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct rt6_exception_bucket *bucket;
	struct dst_entry *from = dst->from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	free_percpu(rt->rt6i_pcpu);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
		rt->rt6i_idev = NULL;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
		rt->rt6i_exception_bucket = NULL;

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);

		rt->rt6i_idev = loopback_idev;

static bool __rt6_check_expired(const struct rt6_info *rt)
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);

static bool rt6_check_expired(const struct rt6_info *rt)
	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
	} else if (rt->dst.from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
		       rt6_check_expired((struct rt6_info *)rt->dst.from);

static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
					     struct flowi6 *fl6, int oif,
	struct rt6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In
	 * such a case it will always be non-zero. Otherwise now is the
	 * time to do it.
	 */
		fl6->mp_hash = rt6_multipath_hash(fl6, NULL);

	route_chosen = fl6->mp_hash % (match->rt6i_nsiblings + 1);
	/* Don't change the route if route_chosen == 0
	 * (siblings do not include ourselves)
	 */
	list_for_each_entry_safe(sibling, next_sibling,
				 &match->rt6i_siblings, rt6i_siblings) {
		if (route_chosen == 0) {
			if (rt6_score_route(sibling, oif, strict) < 0)
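
/* Illustrative sketch, not part of the original file: with three
 * siblings (rt6i_nsiblings == 3, i.e. four equal-cost paths in total),
 * mp_hash % (3 + 1) yields a value in [0, 3].  Zero keeps 'match'
 * itself; 1..3 walk that many steps along the sibling list.  The bare
 * selection arithmetic:
 */
static inline unsigned int rt6_example_path_index(u32 mp_hash,
						  unsigned int nsiblings)
{
	/* nsiblings counts only the other paths, so add one for 'match' */
	return mp_hash % (nsiblings + 1);
}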
/* Route lookup. Any table->tb6_lock is implied. */

static inline struct rt6_info *rt6_device_match(struct net *net,
						const struct in6_addr *saddr,
	struct rt6_info *local = NULL;
	struct rt6_info *sprt;

	if (!oif && ipv6_addr_any(saddr))

	for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
		struct net_device *dev = sprt->dst.dev;

			if (dev->ifindex == oif)
			if (dev->flags & IFF_LOOPBACK) {
				if (!sprt->rt6i_idev ||
				    sprt->rt6i_idev->dev->ifindex != oif) {
					if (flags & RT6_LOOKUP_F_IFACE)
					    local->rt6i_idev->dev->ifindex == oif)
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))

		if (flags & RT6_LOOKUP_F_IFACE)
			return net->ipv6.ip6_null_entry;

#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;

static void rt6_probe_deferred(struct work_struct *w)
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);

static void rt6_probe(struct rt6_info *rt)
	struct __rt6_probe_work *work;
	struct neighbour *neigh;
	/* Okay, this does not seem to be appropriate
	 * for now; however, we need to check if it
	 * is really so, a.k.a. Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))

	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh->nud_state & NUD_VALID)

		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
				rt->rt6i_idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
				__neigh_set_probe_once(neigh);
		write_unlock(&neigh->lock);
		work = kmalloc(sizeof(*work), GFP_ATOMIC);

		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = rt->rt6i_gateway;
		dev_hold(rt->dst.dev);
		work->dev = rt->dst.dev;
		schedule_work(&work->work);

	rcu_read_unlock_bh();
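
/* Illustrative sketch, not part of the original file: rt6_probe() cannot
 * transmit the neighbour solicitation while holding neigh->lock, so it
 * packages the target and device into a work item and queues it with
 * schedule_work() from the locked section.  The same hand-off pattern in
 * miniature, with a hypothetical payload struct:
 */
struct example_probe_work {
	struct work_struct work;
	struct in6_addr target;
};

static void example_probe_deferred(struct work_struct *w)
{
	struct example_probe_work *ew =
		container_of(w, struct example_probe_work, work);

	/* runs in process context: safe to sleep or take other locks */
	kfree(ew);
}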
static inline void rt6_probe(struct rt6_info *rt)

/* Default Router Selection (RFC 2461 6.3.6) */
static inline int rt6_check_dev(struct rt6_info *rt, int oif)
	struct net_device *dev = rt->dst.dev;

	if (!oif || dev->ifindex == oif)
	if ((dev->flags & IFF_LOOPBACK) &&
	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)

static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
	struct neighbour *neigh;
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;

	if (rt->rt6i_flags & RTF_NONEXTHOP ||
	    !(rt->rt6i_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
			ret = RT6_NUD_FAIL_PROBE;
		read_unlock(&neigh->lock);
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	rcu_read_unlock_bh();

static int rt6_score_route(struct rt6_info *rt, int oif,
	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);

static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
				   int *mpri, struct rt6_info *match,
	bool match_do_rr = false;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *dev = rt->dst.dev;

	if (dev && !netif_carrier_ok(dev) &&
	    idev->cnf.ignore_routes_with_linkdown &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))

	if (rt6_check_expired(rt))

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {

	if (strict & RT6_LOOKUP_F_REACHABLE)

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
		*do_rr = match_do_rr;

static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
				     struct rt6_info *rr_head,
				     u32 metric, int oif, int strict,
	struct rt6_info *rt, *match, *cont;

	for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
		if (rt->rt6i_metric != metric) {
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
		if (rt->rt6i_metric != metric) {
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	for (rt = cont; rt; rt = rt->dst.rt6_next)
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
	struct rt6_info *match, *rt0;

		fn->rr_ptr = rt0 = fn->leaf;

	match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,

		struct rt6_info *next = rt0->dst.rt6_next;

		/* no entries matched; do round-robin */
		if (!next || next->rt6i_metric != rt0->rt6i_metric)

	net = dev_net(rt0->dst.dev);
	return match ? match : net->ipv6.ip6_null_entry;

static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
	return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));

#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned long lifetime;

	if (len < sizeof(struct route_info)) {

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
	} else if (rinfo->prefix_len > 128) {
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
		prefix = &prefix_buf;

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(gwaddr, dev);
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,

	if (rt && !lifetime) {

		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

		if (!addrconf_finite_timeout(lifetime))
			rt6_clean_expires(rt);
			rt6_set_expires(rt, jiffies + HZ * lifetime);
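
/* Illustrative note, not part of the original file: the Route
 * Information Option length field counts units of 8 octets, so the
 * option can carry 0, 8 or 16 bytes of prefix:
 *
 *   length == 1  ->  no prefix bytes,  prefix_len must be 0
 *   length == 2  ->  8 prefix bytes,   prefix_len up to 64
 *   length == 3  ->  16 prefix bytes,  prefix_len up to 128
 *
 * which is exactly the pairing the sanity checks above enforce.
 */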
static struct fib6_node *fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
	struct fib6_node *pn;

		if (fn->fn_flags & RTN_TL_ROOT)
		if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
			fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
		if (fn->fn_flags & RTN_RTINFO)

static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6, int flags)
	struct rt6_info *rt, *rt_cache;
	struct fib6_node *fn;

	read_lock_bh(&table->tb6_lock);
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
	if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
		rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
	/* Search through the exception table */
	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);

	dst_use(&rt->dst, jiffies);
	read_unlock_bh(&table->tb6_lock);

	trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
EXPORT_SYMBOL_GPL(ip6_route_lookup);

struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif, int strict)
	struct flowi6 fl6 = {
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;

	dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
		return (struct rt6_info *) dst;

EXPORT_SYMBOL(rt6_lookup);
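
/* Illustrative usage sketch, not part of the original file: resolve a
 * destination on a specific interface.  rt6_lookup() returns a
 * referenced entry or NULL, so the caller drops the reference with
 * ip6_rt_put() when done.
 */
static bool example_dst_is_known(struct net *net,
				 const struct in6_addr *daddr,
				 struct net_device *dev)
{
	struct rt6_info *rt;

	/* strict == 1 requests RT6_LOOKUP_F_IFACE semantics */
	rt = rt6_lookup(net, daddr, NULL, dev->ifindex, 1);
	if (!rt)
		return false;
	ip6_rt_put(rt);
	return true;
}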
/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason,
 * the route is released.
 * Caller must hold dst before calling it.
 */
static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
			struct mx6_config *mxc,
			struct netlink_ext_ack *extack)
	struct fib6_table *table;

	table = rt->rt6i_table;
	write_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
	write_unlock_bh(&table->tb6_lock);

int ip6_ins_rt(struct rt6_info *rt)
	struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
	struct mx6_config mxc = { .mx = NULL, };

	/* Hold dst to account for the reference from the fib6 tree */
	return __ip6_ins_rt(rt, &info, &mxc, NULL);

/* called with rcu_read_lock() held */
static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
	struct net_device *dev = rt->dst.dev;

	if (rt->rt6i_flags & RTF_LOCAL) {
		/* for copies of local routes, dst->dev needs to be the
		 * device itself if it is a master device, the master
		 * device if the device is enslaved, and the loopback
		 * device as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->rt6i_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* the last case is netif_is_l3_master(dev) being true,
		 * in which case we want dev returned to be dev
		 */

static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
	struct net_device *dev;
	struct rt6_info *rt;

	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = (struct rt6_info *)ort->dst.from;

	dev = ip6_rt_get_dev_rcu(ort);
	rt = __ip6_dst_alloc(dev_net(dev), dev, 0);

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->rt6i_metric = 0;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->rt6i_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;

static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);

	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
	pcpu_rt->rt6i_flags |= RTF_PCPU;

/* It should be called with read_lock_bh(&tb6_lock) acquired */
static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);

		dst_hold(&pcpu_rt->dst);
		rt6_dst_from_metrics_check(pcpu_rt);

static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
		struct net *net = dev_net(rt->dst.dev);

		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
		/* If someone did it before us, return prev instead */
		/* release refcnt taken by ip6_rt_pcpu_alloc() */
		dst_release_immediate(&pcpu_rt->dst);
		/* release refcnt taken by above dst_hold() */
		dst_release_immediate(&pcpu_rt->dst);
		dst_hold(&prev->dst);

	rt6_dst_from_metrics_check(pcpu_rt);
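
/* Illustrative sketch, not part of the original file: the cmpxchg()
 * above publishes pcpu_rt only if the per-cpu slot is still NULL; the
 * loser of a race drops its own copy and adopts the winner's.  The bare
 * publish-once pattern:
 */
static struct rt6_info *example_publish_once(struct rt6_info **slot,
					     struct rt6_info *candidate)
{
	struct rt6_info *prev;

	prev = cmpxchg(slot, NULL, candidate);
	if (prev) {
		/* somebody else won: discard ours, reference theirs */
		dst_release_immediate(&candidate->dst);
		dst_hold(&prev->dst);
		return prev;
	}
	return candidate;
}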
/* exception hash table implementation */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
	if (!bucket || !rt6_ex)
	rt6_ex->rt6i->rt6i_node = NULL;
	hlist_del_rcu(&rt6_ex->hlist);
	rt6_release(rt6_ex->rt6i);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
	struct rt6_exception *rt6_ex, *oldest = NULL;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
	rt6_remove_exception(bucket, oldest);

static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
	static u32 seed __read_mostly;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
		val = jhash(src, sizeof(*src), val);
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
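
/* Illustrative note, not part of the original file: jhash() mixes the
 * 128-bit address (plus a boot-time random seed, so the bucket layout
 * is not guessable from outside) into 32 bits, and hash_32() then folds
 * those 32 bits down to FIB6_EXCEPTION_BUCKET_SIZE_SHIFT bits, giving
 * an index into the 2^shift-entry bucket array.  E.g. with a shift of
 * 10 the index range is [0, 1023].
 */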
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
	struct rt6_exception *rt6_ex;

	if (!(*bucket) || !daddr)

	hval = rt6_exception_hash(daddr, saddr);

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
	struct rt6_exception *rt6_ex;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)

	hval = rt6_exception_hash(daddr, saddr);

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);

static int rt6_insert_exception(struct rt6_info *nrt,
				struct rt6_info *ort)
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;

	/* ort can't be a cache or pcpu route */
	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = (struct rt6_info *)ort->dst.from;
	WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));

	spin_lock_bh(&rt6_exception_lock);

	if (ort->exception_bucket_flushed) {

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->rt6i_src.plen)
		src_key = &nrt->rt6i_src.addr;

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);

	rt6_ex->stamp = jiffies;
	atomic_inc(&nrt->rt6i_ref);
	nrt->rt6i_node = ort->rt6i_node;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
		fib6_update_sernum(ort);

void rt6_flush_exceptions(struct rt6_info *rt)
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() from recreating the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);

	spin_unlock_bh(&rt6_exception_lock);

/* Find cached rt in the hash table inside the passed-in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->rt6i_src.plen)
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))

/* Remove the passed-in cached rt from the hash table that contains it */
int rt6_remove_exception_rt(struct rt6_info *rt)
	struct rt6_info *from = (struct rt6_info *)rt->dst.from;
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;

	    !(rt->rt6i_flags & RTF_CACHE))

	if (!rcu_access_pointer(from->rt6i_exception_bucket))

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->rt6i_src.plen)
		src_key = &rt->rt6i_src.addr;
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
		rt6_remove_exception(bucket, rt6_ex);

	spin_unlock_bh(&rt6_exception_lock);

/* Find rt6_ex which contains the passed-in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
	struct rt6_info *from = (struct rt6_info *)rt->dst.from;
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;

	    !(rt->rt6i_flags & RTF_CACHE))

	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->rt6i_src.plen)
		src_key = &rt->rt6i_src.addr;
	rt6_ex = __rt6_find_exception_rcu(&bucket,
		rt6_ex->stamp = jiffies;

static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			rt6_ex->rt6i->rt6i_prefsrc.plen = 0;

static void rt6_exceptions_update_pmtu(struct rt6_info *rt, int mtu)
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;
			/* For RTF_CACHE with rt6i_pmtu == 0
			 * (i.e. a redirected route),
			 * the metrics of its rt->dst.from has already
			 * been updated.
			 */
			if (entry->rt6i_pmtu && entry->rt6i_pmtu > mtu)
				entry->rt6i_pmtu = mtu;

#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
					struct in6_addr *gateway)
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp,
					  &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
			    RTF_CACHE_GATEWAY &&
			    ipv6_addr_equal(gateway,
					    &entry->rt6i_gateway)) {
				rt6_remove_exception(bucket, rt6_ex);

	spin_unlock_bh(&rt6_exception_lock);

static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
	struct rt6_info *rt = rt6_ex->rt6i;

	if (atomic_read(&rt->dst.__refcnt) == 1 &&
	    time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
		RT6_TRACE("aging clone %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
	} else if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway);
			neigh_flags = neigh->flags;
			neigh_release(neigh);
		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
			rt6_remove_exception(bucket, rt6_ex);

void rt6_age_exceptions(struct rt6_info *rt,
			struct fib6_gc_args *gc_args,
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp,
					  &bucket->chain, hlist) {
			rt6_age_examine_exception(bucket, rt6_ex,

	spin_unlock_bh(&rt6_exception_lock);

struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6, int flags)
	struct fib6_node *fn, *saved_fn;
	struct rt6_info *rt, *rt_cache;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	read_lock_bh(&table->tb6_lock);

	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)

	rt = rt6_select(fn, oif, strict);
	if (rt->rt6i_nsiblings)
		rt = rt6_multipath_select(rt, fl6, oif, strict);
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			goto redo_rt6_select;

	/* Search through the exception table */
	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);

	if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
		dst_use(&rt->dst, jiffies);
		read_unlock_bh(&table->tb6_lock);

		rt6_dst_from_metrics_check(rt);

		trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(rt->rt6i_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look up the route here.
		 */
		struct rt6_info *uncached_rt;

		dst_use(&rt->dst, jiffies);
		read_unlock_bh(&table->tb6_lock);

		uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
		dst_release(&rt->dst);

			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);

		trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		rt->dst.lastuse = jiffies;

		pcpu_rt = rt6_get_pcpu_route(rt);

			read_unlock_bh(&table->tb6_lock);

			/* atomic_inc_not_zero() is needed when using rcu */
			if (atomic_inc_not_zero(&rt->rt6i_ref)) {
				/* We have to do the read_unlock first
				 * because rt6_make_pcpu_route() may trigger
				 * ip6_dst_gc() which will take the write_lock.
				 *
				 * No dst_hold() on rt is needed because grabbing
				 * rt->rt6i_ref makes sure rt can't be released.
				 */
				read_unlock_bh(&table->tb6_lock);
				pcpu_rt = rt6_make_pcpu_route(rt);
				/* rt is already removed from tree */
				read_unlock_bh(&table->tb6_lock);
				pcpu_rt = net->ipv6.ip6_null_entry;
				dst_hold(&pcpu_rt->dst);

		trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
EXPORT_SYMBOL_GPL(ip6_pol_route);

static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
					    struct flowi6 *fl6, int flags)
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);

struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6, int flags)
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);

static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys)
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))

	icmph = icmp6_hdr(skb);
	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);

	key_iph = inner_iph;

	memset(keys, 0, sizeof(*keys));
	keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
	keys->addrs.v6addrs.src = key_iph->saddr;
	keys->addrs.v6addrs.dst = key_iph->daddr;
	keys->tags.flow_label = ip6_flowinfo(key_iph);
	keys->basic.ip_proto = key_iph->nexthdr;

/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
	struct flow_keys hash_keys;

		ip6_multipath_l3_keys(skb, &hash_keys);
		return flow_hash_from_keys(&hash_keys);

	return get_hash_from_flowi6(fl6);
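
/* Illustrative note, not part of the original file: hashing the *inner*
 * header of an ICMPv6 error steers the error onto the same sibling as
 * the flow that triggered it.  E.g. a PKT_TOOBIG generated for a flow
 * (saddr = A, daddr = B) hashes over (A, B) taken from the embedded
 * packet rather than over the reporting router's addresses, so the PMTU
 * update lands on the path the flow actually uses.
 */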
void ip6_route_input(struct sk_buff *skb)
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(&fl6, skb);

	skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));

static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
					     struct flowi6 *fl6, int flags)
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);

struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
	if (rt6_need_strict(&fl6->daddr)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

		flags |= RT6_LOOKUP_F_HAS_SADDR;
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
EXPORT_SYMBOL_GPL(ip6_route_output_flags);

struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_NONE, 0);

		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
		rt->rt6i_metric = 0;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);

/* Destination cache support functions */

static void rt6_dst_from_metrics_check(struct rt6_info *rt)
	    dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
		dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);

static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
	if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)

	if (rt6_check_expired(rt))

static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
	if (!__rt6_check_expired(rt) &&
	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
	    rt6_check((struct rt6_info *)(rt->dst.from), cookie))

static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
	struct rt6_info *rt;

	rt = (struct rt6_info *) dst;

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */
	rt6_dst_from_metrics_check(rt);

	if (rt->rt6i_flags & RTF_PCPU ||
	    (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from))
		return rt6_dst_from_check(rt, cookie);

	return rt6_check(rt, cookie);

static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
	struct rt6_info *rt = (struct rt6_info *) dst;

		if (rt->rt6i_flags & RTF_CACHE) {
			if (rt6_check_expired(rt)) {

static void ip6_link_failure(struct sk_buff *skb)
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
		if (rt->rt6i_flags & RTF_CACHE) {
			if (dst_hold_safe(&rt->dst))
			struct fib6_node *fn;

			fn = rcu_dereference(rt->rt6i_node);
			if (fn && (rt->rt6i_flags & RTF_DEFAULT))

static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
	struct net *net = dev_net(rt->dst.dev);

	rt->rt6i_flags |= RTF_MODIFIED;
	rt->rt6i_pmtu = mtu;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);

static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
	return !(rt->rt6i_flags & RTF_CACHE) &&
		(rt->rt6i_flags & RTF_PCPU ||
		 rcu_access_pointer(rt->rt6i_node));

static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	if (rt6->rt6i_flags & RTF_LOCAL)

	if (dst_metric_locked(dst, RTAX_MTU))

		daddr = &iph->daddr;
		saddr = &iph->saddr;
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;

	dst_confirm_neigh(dst, daddr);
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
		struct rt6_info *nrt6;

		nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
			rt6_do_update_pmtu(nrt6, mtu);
			if (rt6_insert_exception(nrt6, rt6))
				dst_release_immediate(&nrt6->dst);
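
/* Illustrative note, not part of the original file: the
 * max_t(u32, mtu, IPV6_MIN_MTU) clamp above means a bogus ICMPv6
 * packet-too-big advertising e.g. 576 is raised to IPV6_MIN_MTU (1280),
 * the smallest MTU an IPv6 link may have, before being compared against
 * the current dst_mtu().
 */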
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu)
	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);

void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
		     int oif, u32 mark, kuid_t uid)
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
	fl6.flowlabel = ip6_flowinfo(iph);
	fl6.flowi6_uid = uid;

	dst = ip6_route_output(net, NULL, &fl6);
		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
EXPORT_SYMBOL_GPL(ip6_update_pmtu);

void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
	struct dst_entry *dst;

	ip6_update_pmtu(skb, sock_net(sk), mtu,
			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);

	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))

	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);

/* Handle redirects */
struct ip6rd_flowi {
	struct in6_addr gateway;

static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *rt, *rt_cache;
	struct fib6_node *fn;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from the appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routers.
	 */
	read_lock_bh(&table->tb6_lock);
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
		if (rt6_check_expired(rt))
		if (!(rt->rt6i_flags & RTF_GATEWAY))
		if (fl6->flowi6_oif != rt->dst.dev->ifindex)
		/* rt_cache's gateway might be different from its 'parent'
		 * in the case of an ip redirect.
		 * So we keep searching in the exception table if the gateway
		 * is different.
		 */
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
			rt_cache = rt6_find_cached_rt(rt,
			    ipv6_addr_equal(&rdfl->gateway,
					    &rt_cache->rt6i_gateway)) {

		rt = net->ipv6.ip6_null_entry;
	else if (rt->dst.error) {
		rt = net->ipv6.ip6_null_entry;

	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);

	read_unlock_bh(&table->tb6_lock);

	trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);

static struct dst_entry *ip6_route_redirect(struct net *net,
					    const struct flowi6 *fl6,
					    const struct in6_addr *gateway)
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip6rd_flowi rdfl;

	rdfl.gateway = *gateway;

	return fib6_rule_lookup(net, &rdfl.fl6,
				flags, __ip6_route_redirect);

void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_iif = LOOPBACK_IFINDEX;
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark;
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
	fl6.flowlabel = ip6_flowinfo(iph);
	fl6.flowi6_uid = uid;

	dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
	rt6_do_redirect(dst, NULL, skb);
EXPORT_SYMBOL_GPL(ip6_redirect);

void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
	struct dst_entry *dst;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_iif = LOOPBACK_IFINDEX;
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark;
	fl6.daddr = msg->dest;
	fl6.saddr = iph->daddr;
	fl6.flowi6_uid = sock_net_uid(net, NULL);

	dst = ip6_route_redirect(net, &fl6, &iph->saddr);
	rt6_do_redirect(dst, NULL, skb);

void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
EXPORT_SYMBOL_GPL(ip6_sk_redirect);

static unsigned int ip6_default_advmss(const struct dst_entry *dst)
	struct net_device *dev = dst->dev;
	unsigned int mtu = dst_mtu(dst);
	struct net *net = dev_net(dev);

	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);

	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;

	/*
	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
	 * IPV6_MAXPLEN is also valid and means: "any MSS,
	 * rely only on pmtu discovery"
	 */
	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
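
/* Worked example, not part of the original file: for a standard
 * Ethernet mtu of 1500, advmss = 1500 - sizeof(struct ipv6hdr) (40)
 * - sizeof(struct tcphdr) (20) = 1440, which sits between the usual
 * ip6_rt_min_advmss floor and the IPV6_MAXPLEN - tcp_header_size cap,
 * so 1440 is returned unchanged.
 */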
static unsigned int ip6_mtu(const struct dst_entry *dst)
	const struct rt6_info *rt = (const struct rt6_info *)dst;
	unsigned int mtu = rt->rt6i_pmtu;
	struct inet6_dev *idev;

	mtu = dst_metric_raw(dst, RTAX_MTU);

	idev = __in6_dev_get(dst->dev);
		mtu = idev->cnf.mtu6;

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);

struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		dst = ERR_PTR(-ENOMEM);

	rt->dst.flags |= DST_HOST;
	rt->dst.output = ip6_output;
	rt->rt6i_gateway = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev = idev;
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Add this dst into uncached_list so that rt6_ifdown() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

static int ip6_dst_gc(struct dst_ops *ops)
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;

	entries = dst_entries_get_fast(ops);
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)

	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout >> 1;
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire >> rt_elasticity;
	return entries > rt_max_size;
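
/* Illustrative note, not part of the original file: each invocation
 * that actually runs GC bumps ip6_rt_gc_expire by one; if the table
 * drops under gc_thresh it is reset to gc_timeout / 2, and on every
 * exit it decays by expire >> elasticity.  With the default elasticity
 * of 9 that is a multiplicative (1 - 1/512) decay per call, e.g. an
 * expire of 512 loses just 1 per call, so GC pressure falls off slowly
 * after a burst.
 */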
static int ip6_convert_metrics(struct mx6_config *mxc,
			       const struct fib6_config *cfg)
	bool ecn_ca = false;

	mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);

	nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
		int type = nla_type(nla);

		if (unlikely(type > RTAX_MAX))

		if (type == RTAX_CC_ALGO) {
			char tmp[TCP_CA_NAME_MAX];

			nla_strlcpy(tmp, nla, sizeof(tmp));
			val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
			if (val == TCP_CA_UNSPEC)
			val = nla_get_u32(nla);
		if (type == RTAX_HOPLIMIT && val > 255)
		if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))

		__set_bit(type - 1, mxc->mx_valid);

		__set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
		mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
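
/* Illustrative sketch, not part of the original file: mx_valid is a
 * bitmap with one bit per RTAX_* metric, offset by one because RTAX
 * values start at 1.  Checking whether a given metric was supplied
 * (assuming struct mx6_config carries the mx array and mx_valid
 * bitmap as used above):
 */
static bool example_metric_present(const struct mx6_config *mxc, int type)
{
	return mxc->mx && test_bit(type - 1, mxc->mx_valid);
}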
2371 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2372 struct fib6_config *cfg,
2373 const struct in6_addr *gw_addr)
2375 struct flowi6 fl6 = {
2376 .flowi6_oif = cfg->fc_ifindex,
2378 .saddr = cfg->fc_prefsrc,
2380 struct fib6_table *table;
2381 struct rt6_info *rt;
2382 int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE;
2384 table = fib6_get_table(net, cfg->fc_table);
2388 if (!ipv6_addr_any(&cfg->fc_prefsrc))
2389 flags |= RT6_LOOKUP_F_HAS_SADDR;
2391 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
2393 /* if table lookup failed, fall back to full lookup */
2394 if (rt == net->ipv6.ip6_null_entry) {
2402 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
2403 struct netlink_ext_ack *extack)
2405 struct net *net = cfg->fc_nlinfo.nl_net;
2406 struct rt6_info *rt = NULL;
2407 struct net_device *dev = NULL;
2408 struct inet6_dev *idev = NULL;
2409 struct fib6_table *table;
2413 /* RTF_PCPU is an internal flag; can not be set by userspace */
2414 if (cfg->fc_flags & RTF_PCPU) {
2415 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2419 if (cfg->fc_dst_len > 128) {
2420 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2423 if (cfg->fc_src_len > 128) {
2424 NL_SET_ERR_MSG(extack, "Invalid source address length");
2427 #ifndef CONFIG_IPV6_SUBTREES
2428 if (cfg->fc_src_len) {
2429 NL_SET_ERR_MSG(extack,
2430 "Specifying source address requires IPV6_SUBTREES to be enabled");
2434 if (cfg->fc_ifindex) {
2436 dev = dev_get_by_index(net, cfg->fc_ifindex);
2439 idev = in6_dev_get(dev);
2444 if (cfg->fc_metric == 0)
2445 cfg->fc_metric = IP6_RT_PRIO_USER;
2448 if (cfg->fc_nlinfo.nlh &&
2449 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2450 table = fib6_get_table(net, cfg->fc_table);
2452 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2453 table = fib6_new_table(net, cfg->fc_table);
2456 table = fib6_new_table(net, cfg->fc_table);
2462 rt = ip6_dst_alloc(net, NULL,
2463 (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
2470 if (cfg->fc_flags & RTF_EXPIRES)
2471 rt6_set_expires(rt, jiffies +
2472 clock_t_to_jiffies(cfg->fc_expires));
2474 rt6_clean_expires(rt);
2476 if (cfg->fc_protocol == RTPROT_UNSPEC)
2477 cfg->fc_protocol = RTPROT_BOOT;
2478 rt->rt6i_protocol = cfg->fc_protocol;
2480 addr_type = ipv6_addr_type(&cfg->fc_dst);
2482 if (addr_type & IPV6_ADDR_MULTICAST)
2483 rt->dst.input = ip6_mc_input;
2484 else if (cfg->fc_flags & RTF_LOCAL)
2485 rt->dst.input = ip6_input;
2487 rt->dst.input = ip6_forward;
2489 rt->dst.output = ip6_output;
2491 if (cfg->fc_encap) {
2492 struct lwtunnel_state *lwtstate;
2494 err = lwtunnel_build_state(cfg->fc_encap_type,
2495 cfg->fc_encap, AF_INET6, cfg,
2499 rt->dst.lwtstate = lwtstate_get(lwtstate);
2500 if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
2501 rt->dst.lwtstate->orig_output = rt->dst.output;
2502 rt->dst.output = lwtunnel_output;
2504 if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
2505 rt->dst.lwtstate->orig_input = rt->dst.input;
2506 rt->dst.input = lwtunnel_input;
2510 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
2511 rt->rt6i_dst.plen = cfg->fc_dst_len;
2512 if (rt->rt6i_dst.plen == 128)
2513 rt->dst.flags |= DST_HOST;
2515 #ifdef CONFIG_IPV6_SUBTREES
2516 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
2517 rt->rt6i_src.plen = cfg->fc_src_len;
2520 rt->rt6i_metric = cfg->fc_metric;
2522 /* We cannot add true routes via loopback here;
2523 they would result in kernel looping. Promote them to reject routes
2525 if ((cfg->fc_flags & RTF_REJECT) ||
2526 (dev && (dev->flags & IFF_LOOPBACK) &&
2527 !(addr_type & IPV6_ADDR_LOOPBACK) &&
2528 !(cfg->fc_flags & RTF_LOCAL))) {
2529 /* hold loopback dev/idev if we haven't done so. */
2530 if (dev != net->loopback_dev) {
2535 dev = net->loopback_dev;
2537 idev = in6_dev_get(dev);
2543 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
2544 switch (cfg->fc_type) {
2546 rt->dst.error = -EINVAL;
2547 rt->dst.output = dst_discard_out;
2548 rt->dst.input = dst_discard;
2551 rt->dst.error = -EACCES;
2552 rt->dst.output = ip6_pkt_prohibit_out;
2553 rt->dst.input = ip6_pkt_prohibit;
2556 case RTN_UNREACHABLE:
2558 rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
2559 : (cfg->fc_type == RTN_UNREACHABLE)
2560 ? -EHOSTUNREACH : -ENETUNREACH;
2561 rt->dst.output = ip6_pkt_discard_out;
2562 rt->dst.input = ip6_pkt_discard;
2568 if (cfg->fc_flags & RTF_GATEWAY) {
2569 const struct in6_addr *gw_addr;
2572 gw_addr = &cfg->fc_gateway;
2573 gwa_type = ipv6_addr_type(gw_addr);
2575 /* if gw_addr is local, we will fail to detect this in case the
2576 * address is still TENTATIVE (DAD in progress). rt6_lookup()
2577 * will return the already-added prefix route via the interface that
2578 * the prefix route was assigned to, which might be non-loopback.
2581 if (ipv6_chk_addr_and_flags(net, gw_addr,
2582 gwa_type & IPV6_ADDR_LINKLOCAL ?
2583 dev : NULL, 0, 0)) {
2584 NL_SET_ERR_MSG(extack, "Invalid gateway address");
2587 rt->rt6i_gateway = *gw_addr;
2589 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
2590 struct rt6_info *grt = NULL;
2592 /* IPv6 strictly prohibits using non-link-local
2593 addresses as nexthop addresses.
2594 Otherwise, a router will not be able to send redirects.
2595 That is very good, but in some (rare!) circumstances
2596 (SIT, PtP, NBMA NOARP links) it is handy to allow
2597 some exceptions. --ANK
2598 We allow IPv4-mapped nexthops to support RFC4798-type
2601 if (!(gwa_type & (IPV6_ADDR_UNICAST |
2602 IPV6_ADDR_MAPPED))) {
2603 NL_SET_ERR_MSG(extack,
2604 "Invalid gateway address");
2608 if (cfg->fc_table) {
2609 grt = ip6_nh_lookup_table(net, cfg, gw_addr);
2612 if (grt->rt6i_flags & RTF_GATEWAY ||
2613 (dev && dev != grt->dst.dev)) {
2621 grt = rt6_lookup(net, gw_addr, NULL,
2622 cfg->fc_ifindex, 1);
2624 err = -EHOSTUNREACH;
2628 if (dev != grt->dst.dev) {
2634 idev = grt->rt6i_idev;
2636 in6_dev_hold(grt->rt6i_idev);
2638 if (!(grt->rt6i_flags & RTF_GATEWAY))
2647 NL_SET_ERR_MSG(extack, "Egress device not specified");
2649 } else if (dev->flags & IFF_LOOPBACK) {
2650 NL_SET_ERR_MSG(extack,
2651 "Egress device can not be loopback device for this route");
2660 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2661 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2662 NL_SET_ERR_MSG(extack, "Invalid source address");
2666 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2667 rt->rt6i_prefsrc.plen = 128;
2669 rt->rt6i_prefsrc.plen = 0;
2671 rt->rt6i_flags = cfg->fc_flags;
2675 rt->rt6i_idev = idev;
2676 rt->rt6i_table = table;
2678 cfg->fc_nlinfo.nl_net = dev_net(dev);
2687 dst_release_immediate(&rt->dst);
2689 return ERR_PTR(err);
2692 int ip6_route_add(struct fib6_config *cfg,
2693 struct netlink_ext_ack *extack)
2695 struct mx6_config mxc = { .mx = NULL, };
2696 struct rt6_info *rt;
2699 rt = ip6_route_info_create(cfg, extack);
2706 err = ip6_convert_metrics(&mxc, cfg);
2710 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
2717 dst_release_immediate(&rt->dst);
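
/*
 * [Editor's example] ip6_route_add() above is also what an RTM_NEWROUTE
 * netlink request ultimately reaches via inet6_rtm_newroute() further
 * below. The following is a minimal userspace sketch of such a request
 * (a separate program, not part of this file, needing CAP_NET_ADMIN).
 * The 2001:db8::/32 prefix, the ifindex 2, and the function name are
 * made-up illustrative values; error handling is trimmed.
 */
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <linux/rtnetlink.h>

static int add_v6_route_sketch(void)
{
	struct {
		struct nlmsghdr nh;
		struct rtmsg rtm;
		char attrs[64];
	} req;
	struct sockaddr_nl sa = { .nl_family = AF_NETLINK };
	struct rtattr *rta;
	struct in6_addr dst;
	int fd;

	memset(&req, 0, sizeof(req));
	req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
	req.nh.nlmsg_type = RTM_NEWROUTE;
	req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
	req.rtm.rtm_family = AF_INET6;
	req.rtm.rtm_dst_len = 32;		/* 2001:db8::/32 */
	req.rtm.rtm_table = RT_TABLE_MAIN;
	req.rtm.rtm_protocol = RTPROT_STATIC;
	req.rtm.rtm_scope = RT_SCOPE_UNIVERSE;
	req.rtm.rtm_type = RTN_UNICAST;

	/* RTA_DST: the destination prefix */
	inet_pton(AF_INET6, "2001:db8::", &dst);
	rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nh.nlmsg_len));
	rta->rta_type = RTA_DST;
	rta->rta_len = RTA_LENGTH(sizeof(dst));
	memcpy(RTA_DATA(rta), &dst, sizeof(dst));
	req.nh.nlmsg_len = NLMSG_ALIGN(req.nh.nlmsg_len) + rta->rta_len;

	/* RTA_OIF: egress device (ifindex 2 is an assumption) */
	rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nh.nlmsg_len));
	rta->rta_type = RTA_OIF;
	rta->rta_len = RTA_LENGTH(sizeof(int));
	*(int *)RTA_DATA(rta) = 2;
	req.nh.nlmsg_len = NLMSG_ALIGN(req.nh.nlmsg_len) + rta->rta_len;

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	if (fd < 0)
		return -1;
	sendto(fd, &req, req.nh.nlmsg_len, 0,
	       (struct sockaddr *)&sa, sizeof(sa));
	close(fd);
	return 0;
}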
2722 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2725 struct fib6_table *table;
2726 struct net *net = dev_net(rt->dst.dev);
2728 if (rt == net->ipv6.ip6_null_entry) {
2733 table = rt->rt6i_table;
2734 write_lock_bh(&table->tb6_lock);
2735 err = fib6_del(rt, info);
2736 write_unlock_bh(&table->tb6_lock);
2743 int ip6_del_rt(struct rt6_info *rt)
2745 struct nl_info info = {
2746 .nl_net = dev_net(rt->dst.dev),
2748 return __ip6_del_rt(rt, &info);
2751 static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
2753 struct nl_info *info = &cfg->fc_nlinfo;
2754 struct net *net = info->nl_net;
2755 struct sk_buff *skb = NULL;
2756 struct fib6_table *table;
2759 if (rt == net->ipv6.ip6_null_entry)
2761 table = rt->rt6i_table;
2762 write_lock_bh(&table->tb6_lock);
2764 if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
2765 struct rt6_info *sibling, *next_sibling;
2767 /* prefer to send a single notification with all hops */
2768 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
2770 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2772 if (rt6_fill_node(net, skb, rt,
2773 NULL, NULL, 0, RTM_DELROUTE,
2774 info->portid, seq, 0) < 0) {
2778 info->skip_notify = 1;
2781 list_for_each_entry_safe(sibling, next_sibling,
2784 err = fib6_del(sibling, info);
2790 err = fib6_del(rt, info);
2792 write_unlock_bh(&table->tb6_lock);
2797 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
2798 info->nlh, gfp_any());
2803 static int ip6_route_del(struct fib6_config *cfg,
2804 struct netlink_ext_ack *extack)
2806 struct rt6_info *rt, *rt_cache;
2807 struct fib6_table *table;
2808 struct fib6_node *fn;
2811 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
2813 NL_SET_ERR_MSG(extack, "FIB table does not exist");
2817 read_lock_bh(&table->tb6_lock);
2819 fn = fib6_locate(&table->tb6_root,
2820 &cfg->fc_dst, cfg->fc_dst_len,
2821 &cfg->fc_src, cfg->fc_src_len,
2822 !(cfg->fc_flags & RTF_CACHE));
2825 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2826 if (cfg->fc_flags & RTF_CACHE) {
2827 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
2833 if (cfg->fc_ifindex &&
2835 rt->dst.dev->ifindex != cfg->fc_ifindex))
2837 if (cfg->fc_flags & RTF_GATEWAY &&
2838 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
2840 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
2842 if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
2845 read_unlock_bh(&table->tb6_lock);
2847 /* if a gateway was specified, delete only that one nexthop */
2848 if (cfg->fc_flags & RTF_GATEWAY)
2849 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
2851 return __ip6_del_rt_siblings(rt, cfg);
2854 read_unlock_bh(&table->tb6_lock);
2859 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
2861 struct netevent_redirect netevent;
2862 struct rt6_info *rt, *nrt = NULL;
2863 struct ndisc_options ndopts;
2864 struct inet6_dev *in6_dev;
2865 struct neighbour *neigh;
2867 int optlen, on_link;
2870 optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
2871 optlen -= sizeof(*msg);
2874 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
2878 msg = (struct rd_msg *)icmp6_hdr(skb);
2880 if (ipv6_addr_is_multicast(&msg->dest)) {
2881 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
2886 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
2888 } else if (ipv6_addr_type(&msg->target) !=
2889 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
2890 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
2894 in6_dev = __in6_dev_get(skb->dev);
2897 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
2901 * The IP source address of the Redirect MUST be the same as the current
2902 * first-hop router for the specified ICMP Destination Address.
2905 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
2906 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
2911 if (ndopts.nd_opts_tgt_lladdr) {
2912 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
2915 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
2920 rt = (struct rt6_info *) dst;
2921 if (rt->rt6i_flags & RTF_REJECT) {
2922 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
2926 /* Redirect received -> path was valid.
2927 * Look, redirects are sent only in response to data packets,
2928 * so this nexthop is apparently reachable. --ANK
2930 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
2932 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
2937 * We have finally decided to accept it.
2940 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
2941 NEIGH_UPDATE_F_WEAK_OVERRIDE|
2942 NEIGH_UPDATE_F_OVERRIDE|
2943 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
2944 NEIGH_UPDATE_F_ISROUTER)),
2945 NDISC_REDIRECT, &ndopts);
2947 nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
2951 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
2953 nrt->rt6i_flags &= ~RTF_GATEWAY;
2955 nrt->rt6i_protocol = RTPROT_REDIRECT;
2956 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
2958 /* No need to remove rt from the exception table if rt is
2959 * a cached route because rt6_insert_exception() will
2962 if (rt6_insert_exception(nrt, rt)) {
2963 dst_release_immediate(&nrt->dst);
2967 netevent.old = &rt->dst;
2968 netevent.new = &nrt->dst;
2969 netevent.daddr = &msg->dest;
2970 netevent.neigh = neigh;
2971 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
2974 neigh_release(neigh);
2978 * Misc support functions
2981 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
2983 BUG_ON(from->dst.from);
2985 rt->rt6i_flags &= ~RTF_EXPIRES;
2986 dst_hold(&from->dst);
2987 rt->dst.from = &from->dst;
2988 dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
2991 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
2993 rt->dst.input = ort->dst.input;
2994 rt->dst.output = ort->dst.output;
2995 rt->rt6i_dst = ort->rt6i_dst;
2996 rt->dst.error = ort->dst.error;
2997 rt->rt6i_idev = ort->rt6i_idev;
2999 in6_dev_hold(rt->rt6i_idev);
3000 rt->dst.lastuse = jiffies;
3001 rt->rt6i_gateway = ort->rt6i_gateway;
3002 rt->rt6i_flags = ort->rt6i_flags;
3003 rt6_set_from(rt, ort);
3004 rt->rt6i_metric = ort->rt6i_metric;
3005 #ifdef CONFIG_IPV6_SUBTREES
3006 rt->rt6i_src = ort->rt6i_src;
3008 rt->rt6i_prefsrc = ort->rt6i_prefsrc;
3009 rt->rt6i_table = ort->rt6i_table;
3010 rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
3013 #ifdef CONFIG_IPV6_ROUTE_INFO
3014 static struct rt6_info *rt6_get_route_info(struct net *net,
3015 const struct in6_addr *prefix, int prefixlen,
3016 const struct in6_addr *gwaddr,
3017 struct net_device *dev)
3019 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3020 int ifindex = dev->ifindex;
3021 struct fib6_node *fn;
3022 struct rt6_info *rt = NULL;
3023 struct fib6_table *table;
3025 table = fib6_get_table(net, tb_id);
3029 read_lock_bh(&table->tb6_lock);
3030 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3034 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
3035 if (rt->dst.dev->ifindex != ifindex)
3037 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3039 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
3045 read_unlock_bh(&table->tb6_lock);
3049 static struct rt6_info *rt6_add_route_info(struct net *net,
3050 const struct in6_addr *prefix, int prefixlen,
3051 const struct in6_addr *gwaddr,
3052 struct net_device *dev,
3055 struct fib6_config cfg = {
3056 .fc_metric = IP6_RT_PRIO_USER,
3057 .fc_ifindex = dev->ifindex,
3058 .fc_dst_len = prefixlen,
3059 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3060 RTF_UP | RTF_PREF(pref),
3061 .fc_protocol = RTPROT_RA,
3062 .fc_nlinfo.portid = 0,
3063 .fc_nlinfo.nlh = NULL,
3064 .fc_nlinfo.nl_net = net,
3067 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3068 cfg.fc_dst = *prefix;
3069 cfg.fc_gateway = *gwaddr;
3071 /* We should treat it as a default route if prefix length is 0. */
3073 cfg.fc_flags |= RTF_DEFAULT;
3075 ip6_route_add(&cfg, NULL);
3077 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3081 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
3083 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3084 struct rt6_info *rt;
3085 struct fib6_table *table;
3087 table = fib6_get_table(dev_net(dev), tb_id);
3091 read_lock_bh(&table->tb6_lock);
3092 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
3093 if (dev == rt->dst.dev &&
3094 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3095 ipv6_addr_equal(&rt->rt6i_gateway, addr))
3100 read_unlock_bh(&table->tb6_lock);
3104 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
3105 struct net_device *dev,
3108 struct fib6_config cfg = {
3109 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3110 .fc_metric = IP6_RT_PRIO_USER,
3111 .fc_ifindex = dev->ifindex,
3112 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3113 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3114 .fc_protocol = RTPROT_RA,
3115 .fc_nlinfo.portid = 0,
3116 .fc_nlinfo.nlh = NULL,
3117 .fc_nlinfo.nl_net = dev_net(dev),
3120 cfg.fc_gateway = *gwaddr;
3122 if (!ip6_route_add(&cfg, NULL)) {
3123 struct fib6_table *table;
3125 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3127 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3130 return rt6_get_dflt_router(gwaddr, dev);
3133 static void __rt6_purge_dflt_routers(struct fib6_table *table)
3135 struct rt6_info *rt;
3138 read_lock_bh(&table->tb6_lock);
3139 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
3140 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3141 (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
3143 read_unlock_bh(&table->tb6_lock);
3148 read_unlock_bh(&table->tb6_lock);
3150 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3153 void rt6_purge_dflt_routers(struct net *net)
3155 struct fib6_table *table;
3156 struct hlist_head *head;
3161 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3162 head = &net->ipv6.fib_table_hash[h];
3163 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3164 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3165 __rt6_purge_dflt_routers(table);
3172 static void rtmsg_to_fib6_config(struct net *net,
3173 struct in6_rtmsg *rtmsg,
3174 struct fib6_config *cfg)
3176 memset(cfg, 0, sizeof(*cfg));
3178 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3180 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3181 cfg->fc_metric = rtmsg->rtmsg_metric;
3182 cfg->fc_expires = rtmsg->rtmsg_info;
3183 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3184 cfg->fc_src_len = rtmsg->rtmsg_src_len;
3185 cfg->fc_flags = rtmsg->rtmsg_flags;
3187 cfg->fc_nlinfo.nl_net = net;
3189 cfg->fc_dst = rtmsg->rtmsg_dst;
3190 cfg->fc_src = rtmsg->rtmsg_src;
3191 cfg->fc_gateway = rtmsg->rtmsg_gateway;
3194 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3196 struct fib6_config cfg;
3197 struct in6_rtmsg rtmsg;
3201 case SIOCADDRT: /* Add a route */
3202 case SIOCDELRT: /* Delete a route */
3203 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3205 err = copy_from_user(&rtmsg, arg,
3206 sizeof(struct in6_rtmsg));
3210 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3215 err = ip6_route_add(&cfg, NULL);
3218 err = ip6_route_del(&cfg, NULL);
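
/*
 * [Editor's example] A minimal userspace sketch of the legacy ioctl
 * path into ipv6_route_ioctl() above (note the ns_capable() check:
 * CAP_NET_ADMIN is required). Separate program, not part of this file;
 * the prefix, metric, and function name are made-up illustrative
 * values, and the exact uapi header interplay can vary by libc.
 */
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <linux/route.h>	/* RTF_UP */
#include <linux/ipv6_route.h>	/* struct in6_rtmsg */

static int add_v6_route_ioctl_sketch(int ifindex)
{
	struct in6_rtmsg rt;
	int fd = socket(AF_INET6, SOCK_DGRAM, 0);

	if (fd < 0)
		return -1;

	memset(&rt, 0, sizeof(rt));
	inet_pton(AF_INET6, "2001:db8::", &rt.rtmsg_dst);
	rt.rtmsg_dst_len = 32;
	rt.rtmsg_metric = 1;		/* becomes cfg->fc_metric */
	rt.rtmsg_flags = RTF_UP;
	rt.rtmsg_ifindex = ifindex;

	/* rtmsg_to_fib6_config() above translates this into a fib6_config */
	if (ioctl(fd, SIOCADDRT, &rt) < 0) {
		close(fd);
		return -1;
	}
	close(fd);
	return 0;
}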
3232 * Drop the packet on the floor
3235 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3238 struct dst_entry *dst = skb_dst(skb);
3239 switch (ipstats_mib_noroutes) {
3240 case IPSTATS_MIB_INNOROUTES:
3241 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3242 if (type == IPV6_ADDR_ANY) {
3243 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3244 IPSTATS_MIB_INADDRERRORS);
3248 case IPSTATS_MIB_OUTNOROUTES:
3249 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3250 ipstats_mib_noroutes);
3253 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3258 static int ip6_pkt_discard(struct sk_buff *skb)
3260 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3263 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3265 skb->dev = skb_dst(skb)->dev;
3266 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3269 static int ip6_pkt_prohibit(struct sk_buff *skb)
3271 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3274 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3276 skb->dev = skb_dst(skb)->dev;
3277 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3281 * Allocate a dst for a local (unicast / anycast) address.
3284 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
3285 const struct in6_addr *addr,
3289 struct net *net = dev_net(idev->dev);
3290 struct net_device *dev = idev->dev;
3291 struct rt6_info *rt;
3293 rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
3295 return ERR_PTR(-ENOMEM);
3299 rt->dst.flags |= DST_HOST;
3300 rt->dst.input = ip6_input;
3301 rt->dst.output = ip6_output;
3302 rt->rt6i_idev = idev;
3304 rt->rt6i_protocol = RTPROT_KERNEL;
3305 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
3307 rt->rt6i_flags |= RTF_ANYCAST;
3309 rt->rt6i_flags |= RTF_LOCAL;
3311 rt->rt6i_gateway = *addr;
3312 rt->rt6i_dst.addr = *addr;
3313 rt->rt6i_dst.plen = 128;
3314 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3315 rt->rt6i_table = fib6_get_table(net, tb_id);
3320 /* remove a deleted IP address from prefsrc entries */
3321 struct arg_dev_net_ip {
3322 struct net_device *dev;
3324 struct in6_addr *addr;
3327 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
3329 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3330 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3331 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3333 if (((void *)rt->dst.dev == dev || !dev) &&
3334 rt != net->ipv6.ip6_null_entry &&
3335 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
3336 spin_lock_bh(&rt6_exception_lock);
3337 /* remove prefsrc entry */
3338 rt->rt6i_prefsrc.plen = 0;
3339 /* need to update cache as well */
3340 rt6_exceptions_remove_prefsrc(rt);
3341 spin_unlock_bh(&rt6_exception_lock);
3346 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3348 struct net *net = dev_net(ifp->idev->dev);
3349 struct arg_dev_net_ip adni = {
3350 .dev = ifp->idev->dev,
3354 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3357 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3359 /* Remove routers and update dst entries when a gateway turns into a host. */
3360 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
3362 struct in6_addr *gateway = (struct in6_addr *)arg;
3364 if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3365 ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
3369 /* Further clean up cached routes in the exception table.
3370 * This is needed because a cached route may have a different
3371 * gateway than its 'parent' in the case of an IP redirect.
3373 rt6_exceptions_clean_tohost(rt, gateway);
3378 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3380 fib6_clean_all(net, fib6_clean_tohost, gateway);
3383 struct arg_dev_net {
3384 struct net_device *dev;
3388 /* called with the write lock held for the table containing rt */
3389 static int fib6_ifdown(struct rt6_info *rt, void *arg)
3391 const struct arg_dev_net *adn = arg;
3392 const struct net_device *dev = adn->dev;
3394 if ((rt->dst.dev == dev || !dev) &&
3395 rt != adn->net->ipv6.ip6_null_entry &&
3396 (rt->rt6i_nsiblings == 0 ||
3397 (dev && netdev_unregistering(dev)) ||
3398 !rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
3404 void rt6_ifdown(struct net *net, struct net_device *dev)
3406 struct arg_dev_net adn = {
3411 fib6_clean_all(net, fib6_ifdown, &adn);
3413 rt6_uncached_list_flush_dev(net, dev);
3416 struct rt6_mtu_change_arg {
3417 struct net_device *dev;
3421 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
3423 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
3424 struct inet6_dev *idev;
3426 /* In IPv6, PMTU discovery is not optional,
3427 so an RTAX_MTU lock cannot disable it.
3428 We still use this lock to block changes
3429 caused by addrconf/ndisc.
3432 idev = __in6_dev_get(arg->dev);
3436 /* For an administrative MTU increase, there is no way to discover
3437 an IPv6 PMTU increase, so the PMTU increase should be applied here.
3438 Since RFC 1981 doesn't cover administrative MTU increases,
3439 updating the PMTU on such an increase is a MUST. (e.g. jumbo frames)
3442 If the new MTU is less than the route PMTU, this new MTU will be the
3443 lowest MTU in the path; update the route PMTU to reflect the PMTU
3444 decrease. If the new MTU is greater than the route PMTU, and the
3445 old MTU was the lowest MTU in the path, update the route PMTU
3446 to reflect the increase. In this case, if the other nodes' MTU
3447 is also the lowest in the path, a Packet Too Big message will lead to
3450 if (rt->dst.dev == arg->dev &&
3451 dst_metric_raw(&rt->dst, RTAX_MTU) &&
3452 !dst_metric_locked(&rt->dst, RTAX_MTU)) {
3453 spin_lock_bh(&rt6_exception_lock);
3454 if (dst_mtu(&rt->dst) >= arg->mtu ||
3455 (dst_mtu(&rt->dst) < arg->mtu &&
3456 dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
3457 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
3459 rt6_exceptions_update_pmtu(rt, arg->mtu);
3460 spin_unlock_bh(&rt6_exception_lock);
3465 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
3467 struct rt6_mtu_change_arg arg = {
3472 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
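
/*
 * [Editor's example] rt6_mtu_change() above runs when a device MTU
 * changes (addrconf's netdev notifier handles NETDEV_CHANGEMTU). A
 * minimal userspace sketch of triggering that path via SIOCSIFMTU;
 * separate program, not part of this file, and the function name is
 * a made-up illustrative one.
 */
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>

static int set_mtu_sketch(const char *ifname, int mtu)
{
	struct ifreq ifr;
	int fd = socket(AF_INET6, SOCK_DGRAM, 0);

	if (fd < 0)
		return -1;

	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, ifname, IFNAMSIZ - 1);
	ifr.ifr_mtu = mtu;	/* e.g. 1400; routes on ifname get revisited */

	if (ioctl(fd, SIOCSIFMTU, &ifr) < 0) {
		close(fd);
		return -1;
	}
	close(fd);
	return 0;
}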
3475 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
3476 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
3477 [RTA_OIF] = { .type = NLA_U32 },
3478 [RTA_IIF] = { .type = NLA_U32 },
3479 [RTA_PRIORITY] = { .type = NLA_U32 },
3480 [RTA_METRICS] = { .type = NLA_NESTED },
3481 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
3482 [RTA_PREF] = { .type = NLA_U8 },
3483 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
3484 [RTA_ENCAP] = { .type = NLA_NESTED },
3485 [RTA_EXPIRES] = { .type = NLA_U32 },
3486 [RTA_UID] = { .type = NLA_U32 },
3487 [RTA_MARK] = { .type = NLA_U32 },
3490 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
3491 struct fib6_config *cfg,
3492 struct netlink_ext_ack *extack)
3495 struct nlattr *tb[RTA_MAX+1];
3499 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
3505 rtm = nlmsg_data(nlh);
3506 memset(cfg, 0, sizeof(*cfg));
3508 cfg->fc_table = rtm->rtm_table;
3509 cfg->fc_dst_len = rtm->rtm_dst_len;
3510 cfg->fc_src_len = rtm->rtm_src_len;
3511 cfg->fc_flags = RTF_UP;
3512 cfg->fc_protocol = rtm->rtm_protocol;
3513 cfg->fc_type = rtm->rtm_type;
3515 if (rtm->rtm_type == RTN_UNREACHABLE ||
3516 rtm->rtm_type == RTN_BLACKHOLE ||
3517 rtm->rtm_type == RTN_PROHIBIT ||
3518 rtm->rtm_type == RTN_THROW)
3519 cfg->fc_flags |= RTF_REJECT;
3521 if (rtm->rtm_type == RTN_LOCAL)
3522 cfg->fc_flags |= RTF_LOCAL;
3524 if (rtm->rtm_flags & RTM_F_CLONED)
3525 cfg->fc_flags |= RTF_CACHE;
3527 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
3528 cfg->fc_nlinfo.nlh = nlh;
3529 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
3531 if (tb[RTA_GATEWAY]) {
3532 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
3533 cfg->fc_flags |= RTF_GATEWAY;
3537 int plen = (rtm->rtm_dst_len + 7) >> 3;
3539 if (nla_len(tb[RTA_DST]) < plen)
3542 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
3546 int plen = (rtm->rtm_src_len + 7) >> 3;
3548 if (nla_len(tb[RTA_SRC]) < plen)
3551 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
3554 if (tb[RTA_PREFSRC])
3555 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
3558 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
3560 if (tb[RTA_PRIORITY])
3561 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
3563 if (tb[RTA_METRICS]) {
3564 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
3565 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
3569 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
3571 if (tb[RTA_MULTIPATH]) {
3572 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
3573 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
3575 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
3576 cfg->fc_mp_len, extack);
3582 pref = nla_get_u8(tb[RTA_PREF]);
3583 if (pref != ICMPV6_ROUTER_PREF_LOW &&
3584 pref != ICMPV6_ROUTER_PREF_HIGH)
3585 pref = ICMPV6_ROUTER_PREF_MEDIUM;
3586 cfg->fc_flags |= RTF_PREF(pref);
3590 cfg->fc_encap = tb[RTA_ENCAP];
3592 if (tb[RTA_ENCAP_TYPE]) {
3593 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
3595 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
3600 if (tb[RTA_EXPIRES]) {
3601 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
3603 if (addrconf_finite_timeout(timeout)) {
3604 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
3605 cfg->fc_flags |= RTF_EXPIRES;
3615 struct rt6_info *rt6_info;
3616 struct fib6_config r_cfg;
3617 struct mx6_config mxc;
3618 struct list_head next;
3621 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
3625 list_for_each_entry(nh, rt6_nh_list, next) {
3626 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
3627 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
3628 nh->r_cfg.fc_ifindex);
3632 static int ip6_route_info_append(struct list_head *rt6_nh_list,
3633 struct rt6_info *rt, struct fib6_config *r_cfg)
3638 list_for_each_entry(nh, rt6_nh_list, next) {
3639 /* check if rt6_info already exists */
3640 if (rt6_duplicate_nexthop(nh->rt6_info, rt))
3644 nh = kzalloc(sizeof(*nh), GFP_KERNEL);
3648 err = ip6_convert_metrics(&nh->mxc, r_cfg);
3653 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
3654 list_add_tail(&nh->next, rt6_nh_list);
3659 static void ip6_route_mpath_notify(struct rt6_info *rt,
3660 struct rt6_info *rt_last,
3661 struct nl_info *info,
3664 /* if this is an APPEND route, then rt points to the first route
3665 * inserted and rt_last points to the last route inserted. Userspace
3666 * wants a consistent dump of the route which starts at the first
3667 * nexthop. Since sibling routes are always added at the end of
3668 * the list, find the first sibling of the last route appended
3670 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
3671 rt = list_first_entry(&rt_last->rt6i_siblings,
3677 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
3680 static int ip6_route_multipath_add(struct fib6_config *cfg,
3681 struct netlink_ext_ack *extack)
3683 struct rt6_info *rt_notif = NULL, *rt_last = NULL;
3684 struct nl_info *info = &cfg->fc_nlinfo;
3685 struct fib6_config r_cfg;
3686 struct rtnexthop *rtnh;
3687 struct rt6_info *rt;
3688 struct rt6_nh *err_nh;
3689 struct rt6_nh *nh, *nh_safe;
3695 int replace = (cfg->fc_nlinfo.nlh &&
3696 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
3697 LIST_HEAD(rt6_nh_list);
3699 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
3700 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
3701 nlflags |= NLM_F_APPEND;
3703 remaining = cfg->fc_mp_len;
3704 rtnh = (struct rtnexthop *)cfg->fc_mp;
3706 /* Parse a Multipath Entry and build a list (rt6_nh_list) of
3707 * rt6_info structs per nexthop
3709 while (rtnh_ok(rtnh, remaining)) {
3710 memcpy(&r_cfg, cfg, sizeof(*cfg));
3711 if (rtnh->rtnh_ifindex)
3712 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3714 attrlen = rtnh_attrlen(rtnh);
3716 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3718 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3720 r_cfg.fc_gateway = nla_get_in6_addr(nla);
3721 r_cfg.fc_flags |= RTF_GATEWAY;
3723 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
3724 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
3726 r_cfg.fc_encap_type = nla_get_u16(nla);
3729 rt = ip6_route_info_create(&r_cfg, extack);
3736 err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
3738 dst_release_immediate(&rt->dst);
3742 rtnh = rtnh_next(rtnh, &remaining);
3745 /* for add and replace, send one notification with all nexthops.
3746 * Skip the notification in fib6_add_rt2node and send one with
3747 * the full route when done
3749 info->skip_notify = 1;
3752 list_for_each_entry(nh, &rt6_nh_list, next) {
3753 rt_last = nh->rt6_info;
3754 err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
3755 /* save reference to first route for notification */
3756 if (!rt_notif && !err)
3757 rt_notif = nh->rt6_info;
3759 /* nh->rt6_info is used or freed at this point, reset to NULL */
3760 nh->rt6_info = NULL;
3763 ip6_print_replace_route_err(&rt6_nh_list);
3768 /* Because each route is added like a single route, we remove
3769 * these flags after the first nexthop: if there is a collision,
3770 * we have already failed to add the first nexthop:
3771 * fib6_add_rt2node() has rejected it; when replacing, old
3772 * nexthops have been replaced by the first new one, and the rest should
3775 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
3780 /* success ... tell user about new route */
3781 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3785 /* send notification for routes that were added so that
3786 * the delete notifications sent by ip6_route_del are
3790 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3792 /* Delete routes that were already added */
3793 list_for_each_entry(nh, &rt6_nh_list, next) {
3796 ip6_route_del(&nh->r_cfg, extack);
3800 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
3802 dst_release_immediate(&nh->rt6_info->dst);
3804 list_del(&nh->next);
3811 static int ip6_route_multipath_del(struct fib6_config *cfg,
3812 struct netlink_ext_ack *extack)
3814 struct fib6_config r_cfg;
3815 struct rtnexthop *rtnh;
3818 int err = 1, last_err = 0;
3820 remaining = cfg->fc_mp_len;
3821 rtnh = (struct rtnexthop *)cfg->fc_mp;
3823 /* Parse a Multipath Entry */
3824 while (rtnh_ok(rtnh, remaining)) {
3825 memcpy(&r_cfg, cfg, sizeof(*cfg));
3826 if (rtnh->rtnh_ifindex)
3827 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3829 attrlen = rtnh_attrlen(rtnh);
3831 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3833 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3835 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
3836 r_cfg.fc_flags |= RTF_GATEWAY;
3839 err = ip6_route_del(&r_cfg, extack);
3843 rtnh = rtnh_next(rtnh, &remaining);
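
/*
 * [Editor's example] Both multipath handlers above walk a stream of
 * struct rtnexthop entries nested inside RTA_MULTIPATH. A minimal
 * userspace sketch of encoding that attribute for n gateways follows;
 * illustrative only -- the caller must size buf generously, and the
 * ifindex 2, documentation-prefix gateways, and function name are
 * made-up values.
 */
#include <string.h>
#include <arpa/inet.h>
#include <linux/rtnetlink.h>

static int encode_multipath_sketch(char *buf,
				   const char * const gw_strs[], int n)
{
	struct rtattr *mp = (struct rtattr *)buf;
	char *p = buf + RTA_LENGTH(0);	/* nexthops follow the attr header */
	int i;

	for (i = 0; i < n; i++) {
		struct rtnexthop *rtnh = (struct rtnexthop *)p;
		struct rtattr *rta = (struct rtattr *)(p + RTNH_LENGTH(0));
		struct in6_addr gw;

		memset(rtnh, 0, sizeof(*rtnh));
		rtnh->rtnh_ifindex = 2;	/* assumed egress ifindex */

		if (inet_pton(AF_INET6, gw_strs[i], &gw) != 1)
			return -1;
		rta->rta_type = RTA_GATEWAY;	/* per-hop nested attribute */
		rta->rta_len = RTA_LENGTH(sizeof(gw));
		memcpy(RTA_DATA(rta), &gw, sizeof(gw));

		rtnh->rtnh_len = RTNH_LENGTH(RTA_ALIGN(rta->rta_len));
		p += RTNH_ALIGN(rtnh->rtnh_len);
	}
	mp->rta_type = RTA_MULTIPATH;
	mp->rta_len = p - buf;
	return RTA_ALIGN(mp->rta_len);	/* bytes consumed in buf */
}

/* usage sketch:
 *	const char * const gws[] = { "2001:db8::1", "2001:db8::2" };
 *	int len = encode_multipath_sketch(buf, gws, 2);
 */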
3849 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3850 struct netlink_ext_ack *extack)
3852 struct fib6_config cfg;
3855 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3860 return ip6_route_multipath_del(&cfg, extack);
3862 cfg.fc_delete_all_nh = 1;
3863 return ip6_route_del(&cfg, extack);
3867 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3868 struct netlink_ext_ack *extack)
3870 struct fib6_config cfg;
3873 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3878 return ip6_route_multipath_add(&cfg, extack);
3880 return ip6_route_add(&cfg, extack);
3883 static size_t rt6_nlmsg_size(struct rt6_info *rt)
3885 int nexthop_len = 0;
3887 if (rt->rt6i_nsiblings) {
3888 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
3889 + NLA_ALIGN(sizeof(struct rtnexthop))
3890 + nla_total_size(16) /* RTA_GATEWAY */
3891 + lwtunnel_get_encap_size(rt->dst.lwtstate);
3893 nexthop_len *= rt->rt6i_nsiblings;
3896 return NLMSG_ALIGN(sizeof(struct rtmsg))
3897 + nla_total_size(16) /* RTA_SRC */
3898 + nla_total_size(16) /* RTA_DST */
3899 + nla_total_size(16) /* RTA_GATEWAY */
3900 + nla_total_size(16) /* RTA_PREFSRC */
3901 + nla_total_size(4) /* RTA_TABLE */
3902 + nla_total_size(4) /* RTA_IIF */
3903 + nla_total_size(4) /* RTA_OIF */
3904 + nla_total_size(4) /* RTA_PRIORITY */
3905 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
3906 + nla_total_size(sizeof(struct rta_cacheinfo))
3907 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
3908 + nla_total_size(1) /* RTA_PREF */
3909 + lwtunnel_get_encap_size(rt->dst.lwtstate)
3913 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
3914 unsigned int *flags, bool skip_oif)
3916 if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) {
3917 *flags |= RTNH_F_LINKDOWN;
3918 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
3919 *flags |= RTNH_F_DEAD;
3922 if (rt->rt6i_flags & RTF_GATEWAY) {
3923 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
3924 goto nla_put_failure;
3927 if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
3928 *flags |= RTNH_F_OFFLOAD;
3930 /* not needed for multipath encoding because it has a rtnexthop struct */
3931 if (!skip_oif && rt->dst.dev &&
3932 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
3933 goto nla_put_failure;
3935 if (rt->dst.lwtstate &&
3936 lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
3937 goto nla_put_failure;
3945 /* add multipath next hop */
3946 static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
3948 struct rtnexthop *rtnh;
3949 unsigned int flags = 0;
3951 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
3953 goto nla_put_failure;
3955 rtnh->rtnh_hops = 0;
3956 rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
3958 if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
3959 goto nla_put_failure;
3961 rtnh->rtnh_flags = flags;
3963 /* length of rtnetlink header + attributes */
3964 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
3972 static int rt6_fill_node(struct net *net,
3973 struct sk_buff *skb, struct rt6_info *rt,
3974 struct in6_addr *dst, struct in6_addr *src,
3975 int iif, int type, u32 portid, u32 seq,
3978 u32 metrics[RTAX_MAX];
3980 struct nlmsghdr *nlh;
3984 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
3988 rtm = nlmsg_data(nlh);
3989 rtm->rtm_family = AF_INET6;
3990 rtm->rtm_dst_len = rt->rt6i_dst.plen;
3991 rtm->rtm_src_len = rt->rt6i_src.plen;
3994 table = rt->rt6i_table->tb6_id;
3996 table = RT6_TABLE_UNSPEC;
3997 rtm->rtm_table = table;
3998 if (nla_put_u32(skb, RTA_TABLE, table))
3999 goto nla_put_failure;
4000 if (rt->rt6i_flags & RTF_REJECT) {
4001 switch (rt->dst.error) {
4003 rtm->rtm_type = RTN_BLACKHOLE;
4006 rtm->rtm_type = RTN_PROHIBIT;
4009 rtm->rtm_type = RTN_THROW;
4012 rtm->rtm_type = RTN_UNREACHABLE;
4016 else if (rt->rt6i_flags & RTF_LOCAL)
4017 rtm->rtm_type = RTN_LOCAL;
4018 else if (rt->rt6i_flags & RTF_ANYCAST)
4019 rtm->rtm_type = RTN_ANYCAST;
4020 else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
4021 rtm->rtm_type = RTN_LOCAL;
4023 rtm->rtm_type = RTN_UNICAST;
4025 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4026 rtm->rtm_protocol = rt->rt6i_protocol;
4028 if (rt->rt6i_flags & RTF_CACHE)
4029 rtm->rtm_flags |= RTM_F_CLONED;
4032 if (nla_put_in6_addr(skb, RTA_DST, dst))
4033 goto nla_put_failure;
4034 rtm->rtm_dst_len = 128;
4035 } else if (rtm->rtm_dst_len)
4036 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
4037 goto nla_put_failure;
4038 #ifdef CONFIG_IPV6_SUBTREES
4040 if (nla_put_in6_addr(skb, RTA_SRC, src))
4041 goto nla_put_failure;
4042 rtm->rtm_src_len = 128;
4043 } else if (rtm->rtm_src_len &&
4044 nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
4045 goto nla_put_failure;
4048 #ifdef CONFIG_IPV6_MROUTE
4049 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
4050 int err = ip6mr_get_route(net, skb, rtm, portid);
4055 goto nla_put_failure;
4058 if (nla_put_u32(skb, RTA_IIF, iif))
4059 goto nla_put_failure;
4061 struct in6_addr saddr_buf;
4062 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
4063 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4064 goto nla_put_failure;
4067 if (rt->rt6i_prefsrc.plen) {
4068 struct in6_addr saddr_buf;
4069 saddr_buf = rt->rt6i_prefsrc.addr;
4070 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4071 goto nla_put_failure;
4074 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
4076 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
4077 if (rtnetlink_put_metrics(skb, metrics) < 0)
4078 goto nla_put_failure;
4080 if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
4081 goto nla_put_failure;
4083 /* For multipath routes, walk the siblings list and add
4084 * each as a nexthop within RTA_MULTIPATH.
4086 if (rt->rt6i_nsiblings) {
4087 struct rt6_info *sibling, *next_sibling;
4090 mp = nla_nest_start(skb, RTA_MULTIPATH);
4092 goto nla_put_failure;
4094 if (rt6_add_nexthop(skb, rt) < 0)
4095 goto nla_put_failure;
4097 list_for_each_entry_safe(sibling, next_sibling,
4098 &rt->rt6i_siblings, rt6i_siblings) {
4099 if (rt6_add_nexthop(skb, sibling) < 0)
4100 goto nla_put_failure;
4103 nla_nest_end(skb, mp);
4105 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4106 goto nla_put_failure;
4109 expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
4111 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
4112 goto nla_put_failure;
4114 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
4115 goto nla_put_failure;
4118 nlmsg_end(skb, nlh);
4122 nlmsg_cancel(skb, nlh);
4126 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
4128 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4129 struct net *net = arg->net;
4131 if (rt == net->ipv6.ip6_null_entry)
4134 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4135 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4137 /* user wants prefix routes only */
4138 if (rtm->rtm_flags & RTM_F_PREFIX &&
4139 !(rt->rt6i_flags & RTF_PREFIX_RT)) {
4140 /* success since this is not a prefix route */
4145 return rt6_fill_node(net,
4146 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
4147 NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
4151 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4152 struct netlink_ext_ack *extack)
4154 struct net *net = sock_net(in_skb->sk);
4155 struct nlattr *tb[RTA_MAX+1];
4156 int err, iif = 0, oif = 0;
4157 struct dst_entry *dst;
4158 struct rt6_info *rt;
4159 struct sk_buff *skb;
4164 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4170 memset(&fl6, 0, sizeof(fl6));
4171 rtm = nlmsg_data(nlh);
4172 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4173 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4176 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4179 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4183 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4186 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4190 iif = nla_get_u32(tb[RTA_IIF]);
4193 oif = nla_get_u32(tb[RTA_OIF]);
4196 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4199 fl6.flowi6_uid = make_kuid(current_user_ns(),
4200 nla_get_u32(tb[RTA_UID]));
4202 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4205 struct net_device *dev;
4210 dev = dev_get_by_index_rcu(net, iif);
4217 fl6.flowi6_iif = iif;
4219 if (!ipv6_addr_any(&fl6.saddr))
4220 flags |= RT6_LOOKUP_F_HAS_SADDR;
4223 dst = ip6_route_input_lookup(net, dev, &fl6, flags);
4225 dst = ip6_route_lookup(net, &fl6, 0);
4229 fl6.flowi6_oif = oif;
4232 dst = ip6_route_output(net, NULL, &fl6);
4234 dst = ip6_route_lookup(net, &fl6, 0);
4238 rt = container_of(dst, struct rt6_info, dst);
4239 if (rt->dst.error) {
4240 err = rt->dst.error;
4245 if (rt == net->ipv6.ip6_null_entry) {
4246 err = rt->dst.error;
4251 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4258 skb_dst_set(skb, &rt->dst);
4260 err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
4261 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4264 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
4265 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4272 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
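
/*
 * [Editor's example] A minimal userspace sketch of the RTM_GETROUTE
 * query served by inet6_rtm_getroute() above. Setting RTM_F_FIB_MATCH
 * asks for the matching FIB entry rather than the resolved dst (the
 * fibmatch branch above). Separate program, not part of this file;
 * the function name is a made-up illustrative one.
 */
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <linux/rtnetlink.h>

static int query_v6_route_sketch(const char *dst_str)
{
	struct {
		struct nlmsghdr nh;
		struct rtmsg rtm;
		char attrs[64];
	} req;
	struct sockaddr_nl sa = { .nl_family = AF_NETLINK };
	struct rtattr *rta;
	struct in6_addr dst;
	char reply[4096];
	int fd, n;

	memset(&req, 0, sizeof(req));
	req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
	req.nh.nlmsg_type = RTM_GETROUTE;
	req.nh.nlmsg_flags = NLM_F_REQUEST;
	req.rtm.rtm_family = AF_INET6;
	req.rtm.rtm_dst_len = 128;		/* host query */
	req.rtm.rtm_flags = RTM_F_FIB_MATCH;	/* FIB entry, not the dst */

	if (inet_pton(AF_INET6, dst_str, &dst) != 1)
		return -1;
	rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nh.nlmsg_len));
	rta->rta_type = RTA_DST;
	rta->rta_len = RTA_LENGTH(sizeof(dst));
	memcpy(RTA_DATA(rta), &dst, sizeof(dst));
	req.nh.nlmsg_len = NLMSG_ALIGN(req.nh.nlmsg_len) + rta->rta_len;

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	if (fd < 0)
		return -1;
	sendto(fd, &req, req.nh.nlmsg_len, 0,
	       (struct sockaddr *)&sa, sizeof(sa));
	n = recv(fd, reply, sizeof(reply), 0);	/* RTM_NEWROUTE answer or error */
	close(fd);
	return n > 0 ? 0 : -1;
}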
4277 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
4278 unsigned int nlm_flags)
4280 struct sk_buff *skb;
4281 struct net *net = info->nl_net;
4286 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4288 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4292 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
4293 event, info->portid, seq, nlm_flags);
4295 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4296 WARN_ON(err == -EMSGSIZE);
4300 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4301 info->nlh, gfp_any());
4305 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4308 static int ip6_route_dev_notify(struct notifier_block *this,
4309 unsigned long event, void *ptr)
4311 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4312 struct net *net = dev_net(dev);
4314 if (!(dev->flags & IFF_LOOPBACK))
4317 if (event == NETDEV_REGISTER) {
4318 net->ipv6.ip6_null_entry->dst.dev = dev;
4319 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
4320 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4321 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
4322 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
4323 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
4324 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
4326 } else if (event == NETDEV_UNREGISTER &&
4327 dev->reg_state != NETREG_UNREGISTERED) {
4328 /* NETDEV_UNREGISTER could be fired multiple times by
4329 * netdev_wait_allrefs(). Make sure we only call this once.
4331 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
4332 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4333 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
4334 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
4345 #ifdef CONFIG_PROC_FS
4347 static const struct file_operations ipv6_route_proc_fops = {
4348 .owner = THIS_MODULE,
4349 .open = ipv6_route_open,
4351 .llseek = seq_lseek,
4352 .release = seq_release_net,
4355 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
4357 struct net *net = (struct net *)seq->private;
4358 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
4359 net->ipv6.rt6_stats->fib_nodes,
4360 net->ipv6.rt6_stats->fib_route_nodes,
4361 net->ipv6.rt6_stats->fib_rt_alloc,
4362 net->ipv6.rt6_stats->fib_rt_entries,
4363 net->ipv6.rt6_stats->fib_rt_cache,
4364 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
4365 net->ipv6.rt6_stats->fib_discarded_routes);
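
/*
 * [Editor's example] The seven hex fields printed above can be read
 * back from /proc/net/rt6_stats with a plain fscanf; the field order
 * matches the seq_printf() format string. Userspace sketch, separate
 * program; the function name is a made-up illustrative one.
 */
#include <stdio.h>

static int read_rt6_stats_sketch(unsigned int st[7])
{
	/* st[0] fib_nodes, st[1] fib_route_nodes, st[2] fib_rt_alloc,
	 * st[3] fib_rt_entries, st[4] fib_rt_cache, st[5] dst entries,
	 * st[6] fib_discarded_routes
	 */
	FILE *f = fopen("/proc/net/rt6_stats", "r");
	int n;

	if (!f)
		return -1;
	n = fscanf(f, "%x %x %x %x %x %x %x",
		   &st[0], &st[1], &st[2], &st[3], &st[4], &st[5], &st[6]);
	fclose(f);
	return n == 7 ? 0 : -1;
}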
4370 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
4372 return single_open_net(inode, file, rt6_stats_seq_show);
4375 static const struct file_operations rt6_stats_seq_fops = {
4376 .owner = THIS_MODULE,
4377 .open = rt6_stats_seq_open,
4379 .llseek = seq_lseek,
4380 .release = single_release_net,
4382 #endif /* CONFIG_PROC_FS */
4384 #ifdef CONFIG_SYSCTL
4387 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4388 void __user *buffer, size_t *lenp, loff_t *ppos)
4395 net = (struct net *)ctl->extra1;
4396 delay = net->ipv6.sysctl.flush_delay;
4397 proc_dointvec(ctl, write, buffer, lenp, ppos);
4398 fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
4402 struct ctl_table ipv6_route_table_template[] = {
4404 .procname = "flush",
4405 .data = &init_net.ipv6.sysctl.flush_delay,
4406 .maxlen = sizeof(int),
4408 .proc_handler = ipv6_sysctl_rtcache_flush
4411 .procname = "gc_thresh",
4412 .data = &ip6_dst_ops_template.gc_thresh,
4413 .maxlen = sizeof(int),
4415 .proc_handler = proc_dointvec,
4418 .procname = "max_size",
4419 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
4420 .maxlen = sizeof(int),
4422 .proc_handler = proc_dointvec,
4425 .procname = "gc_min_interval",
4426 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4427 .maxlen = sizeof(int),
4429 .proc_handler = proc_dointvec_jiffies,
4432 .procname = "gc_timeout",
4433 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
4434 .maxlen = sizeof(int),
4436 .proc_handler = proc_dointvec_jiffies,
4439 .procname = "gc_interval",
4440 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
4441 .maxlen = sizeof(int),
4443 .proc_handler = proc_dointvec_jiffies,
4446 .procname = "gc_elasticity",
4447 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
4448 .maxlen = sizeof(int),
4450 .proc_handler = proc_dointvec,
4453 .procname = "mtu_expires",
4454 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
4455 .maxlen = sizeof(int),
4457 .proc_handler = proc_dointvec_jiffies,
4460 .procname = "min_adv_mss",
4461 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
4462 .maxlen = sizeof(int),
4464 .proc_handler = proc_dointvec,
4467 .procname = "gc_min_interval_ms",
4468 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4469 .maxlen = sizeof(int),
4471 .proc_handler = proc_dointvec_ms_jiffies,
4476 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
4478 struct ctl_table *table;
4480 table = kmemdup(ipv6_route_table_template,
4481 sizeof(ipv6_route_table_template),
4485 table[0].data = &net->ipv6.sysctl.flush_delay;
4486 table[0].extra1 = net;
4487 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
4488 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
4489 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4490 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
4491 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
4492 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
4493 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
4494 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
4495 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4497 /* Don't export sysctls to unprivileged users */
4498 if (net->user_ns != &init_user_ns)
4499 table[0].procname = NULL;
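
/*
 * [Editor's example] Writing to the "flush" entry installed above runs
 * ipv6_sysctl_rtcache_flush(), which snapshots the previous flush_delay
 * and then kicks fib6_run_gc(). A minimal userspace sketch; separate
 * program, with a made-up function name, and the exact delay semantics
 * are those of the handler above.
 */
#include <fcntl.h>
#include <unistd.h>

static int flush_rt6_cache_sketch(void)
{
	int fd = open("/proc/sys/net/ipv6/route/flush", O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, "0\n", 2) != 2) {
		close(fd);
		return -1;
	}
	close(fd);
	return 0;
}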
4506 static int __net_init ip6_route_net_init(struct net *net)
4510 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
4511 sizeof(net->ipv6.ip6_dst_ops));
4513 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
4514 goto out_ip6_dst_ops;
4516 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
4517 sizeof(*net->ipv6.ip6_null_entry),
4519 if (!net->ipv6.ip6_null_entry)
4520 goto out_ip6_dst_entries;
4521 net->ipv6.ip6_null_entry->dst.path =
4522 (struct dst_entry *)net->ipv6.ip6_null_entry;
4523 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4524 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
4525 ip6_template_metrics, true);
4527 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4528 net->ipv6.fib6_has_custom_rules = false;
4529 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
4530 sizeof(*net->ipv6.ip6_prohibit_entry),
4532 if (!net->ipv6.ip6_prohibit_entry)
4533 goto out_ip6_null_entry;
4534 net->ipv6.ip6_prohibit_entry->dst.path =
4535 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
4536 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4537 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
4538 ip6_template_metrics, true);
4540 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
4541 sizeof(*net->ipv6.ip6_blk_hole_entry),
4543 if (!net->ipv6.ip6_blk_hole_entry)
4544 goto out_ip6_prohibit_entry;
4545 net->ipv6.ip6_blk_hole_entry->dst.path =
4546 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
4547 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4548 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
4549 ip6_template_metrics, true);
4552 net->ipv6.sysctl.flush_delay = 0;
4553 net->ipv6.sysctl.ip6_rt_max_size = 4096;
4554 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
4555 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
4556 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
4557 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
4558 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
4559 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
4561 net->ipv6.ip6_rt_gc_expire = 30*HZ;
4567 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4568 out_ip6_prohibit_entry:
4569 kfree(net->ipv6.ip6_prohibit_entry);
4571 kfree(net->ipv6.ip6_null_entry);
4573 out_ip6_dst_entries:
4574 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
4579 static void __net_exit ip6_route_net_exit(struct net *net)
4581 kfree(net->ipv6.ip6_null_entry);
4582 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4583 kfree(net->ipv6.ip6_prohibit_entry);
4584 kfree(net->ipv6.ip6_blk_hole_entry);
4586 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
4589 static int __net_init ip6_route_net_init_late(struct net *net)
4591 #ifdef CONFIG_PROC_FS
4592 proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
4593 proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
4598 static void __net_exit ip6_route_net_exit_late(struct net *net)
4600 #ifdef CONFIG_PROC_FS
4601 remove_proc_entry("ipv6_route", net->proc_net);
4602 remove_proc_entry("rt6_stats", net->proc_net);
4606 static struct pernet_operations ip6_route_net_ops = {
4607 .init = ip6_route_net_init,
4608 .exit = ip6_route_net_exit,
4611 static int __net_init ipv6_inetpeer_init(struct net *net)
4613 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
4617 inet_peer_base_init(bp);
4618 net->ipv6.peers = bp;
4622 static void __net_exit ipv6_inetpeer_exit(struct net *net)
4624 struct inet_peer_base *bp = net->ipv6.peers;
4626 net->ipv6.peers = NULL;
4627 inetpeer_invalidate_tree(bp);
4631 static struct pernet_operations ipv6_inetpeer_ops = {
4632 .init = ipv6_inetpeer_init,
4633 .exit = ipv6_inetpeer_exit,
4636 static struct pernet_operations ip6_route_net_late_ops = {
4637 .init = ip6_route_net_init_late,
4638 .exit = ip6_route_net_exit_late,
4641 static struct notifier_block ip6_route_dev_notifier = {
4642 .notifier_call = ip6_route_dev_notify,
4643 .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
4646 void __init ip6_route_init_special_entries(void)
4648 /* Registration of the loopback device is done before this portion
4649 * of code runs, so the loopback reference in rt6_info will not be
4650 * taken; do it manually for init_net */
4651 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
4652 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4653 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4654 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
4655 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4656 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
4657 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4661 int __init ip6_route_init(void)
4667 ip6_dst_ops_template.kmem_cachep =
4668 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
4669 SLAB_HWCACHE_ALIGN, NULL);
4670 if (!ip6_dst_ops_template.kmem_cachep)
4673 ret = dst_entries_init(&ip6_dst_blackhole_ops);
4675 goto out_kmem_cache;
4677 ret = register_pernet_subsys(&ipv6_inetpeer_ops);
4679 goto out_dst_entries;
4681 ret = register_pernet_subsys(&ip6_route_net_ops);
4683 goto out_register_inetpeer;
4685 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
4689 goto out_register_subsys;
4695 ret = fib6_rules_init();
4699 ret = register_pernet_subsys(&ip6_route_net_late_ops);
4701 goto fib6_rules_init;
4704 if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, 0) ||
4705 __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, 0) ||
4706 __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL,
4707 RTNL_FLAG_DOIT_UNLOCKED))
4708 goto out_register_late_subsys;
4710 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
4712 goto out_register_late_subsys;
4714 for_each_possible_cpu(cpu) {
4715 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
4717 INIT_LIST_HEAD(&ul->head);
4718 spin_lock_init(&ul->lock);
4724 out_register_late_subsys:
4725 unregister_pernet_subsys(&ip6_route_net_late_ops);
4727 fib6_rules_cleanup();
4732 out_register_subsys:
4733 unregister_pernet_subsys(&ip6_route_net_ops);
4734 out_register_inetpeer:
4735 unregister_pernet_subsys(&ipv6_inetpeer_ops);
4737 dst_entries_destroy(&ip6_dst_blackhole_ops);
4739 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
4743 void ip6_route_cleanup(void)
4745 unregister_netdevice_notifier(&ip6_route_dev_notifier);
4746 unregister_pernet_subsys(&ip6_route_net_late_ops);
4747 fib6_rules_cleanup();
4750 unregister_pernet_subsys(&ipv6_inetpeer_ops);
4751 unregister_pernet_subsys(&ip6_route_net_ops);
4752 dst_entries_destroy(&ip6_dst_blackhole_ops);
4753 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);