2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
24 * Fixed routing subtrees.
27 #define pr_fmt(fmt) "IPv6: " fmt
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
56 #include <linux/rtnetlink.h>
58 #include <net/dst_metadata.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <trace/events/fib6.h>
68 #include <linux/uaccess.h>
71 #include <linux/sysctl.h>
75 RT6_NUD_FAIL_HARD = -3,
76 RT6_NUD_FAIL_PROBE = -2,
77 RT6_NUD_FAIL_DO_RR = -1,
81 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
82 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
83 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
84 static unsigned int ip6_mtu(const struct dst_entry *dst);
85 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
86 static void ip6_dst_destroy(struct dst_entry *);
87 static void ip6_dst_ifdown(struct dst_entry *,
88 struct net_device *dev, int how);
89 static int ip6_dst_gc(struct dst_ops *ops);
91 static int ip6_pkt_discard(struct sk_buff *skb);
92 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
93 static int ip6_pkt_prohibit(struct sk_buff *skb);
94 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
95 static void ip6_link_failure(struct sk_buff *skb);
96 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
97 struct sk_buff *skb, u32 mtu);
98 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
100 static void rt6_dst_from_metrics_check(struct rt6_info *rt);
101 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
102 static size_t rt6_nlmsg_size(struct rt6_info *rt);
103 static int rt6_fill_node(struct net *net,
104 struct sk_buff *skb, struct rt6_info *rt,
105 struct in6_addr *dst, struct in6_addr *src,
106 int iif, int type, u32 portid, u32 seq,
108 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
109 struct in6_addr *daddr,
110 struct in6_addr *saddr);
112 #ifdef CONFIG_IPV6_ROUTE_INFO
113 static struct rt6_info *rt6_add_route_info(struct net *net,
114 const struct in6_addr *prefix, int prefixlen,
115 const struct in6_addr *gwaddr,
116 struct net_device *dev,
118 static struct rt6_info *rt6_get_route_info(struct net *net,
119 const struct in6_addr *prefix, int prefixlen,
120 const struct in6_addr *gwaddr,
121 struct net_device *dev);
124 struct uncached_list {
126 struct list_head head;
129 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
/* Link an uncached (not fib6-tree-owned) route onto the current CPU's
 * per-cpu uncached list under that list's lock, and remember which list
 * it went on so rt6_uncached_list_del() can find it later.
 * NOTE(review): this extraction is missing interior lines (braces, and
 * possibly a stats increment) — confirm against the full source.
 */
131 static void rt6_uncached_list_add(struct rt6_info *rt)
133 struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
135 rt->rt6i_uncached_list = ul;
137 spin_lock_bh(&ul->lock);
138 list_add_tail(&rt->rt6i_uncached, &ul->head);
139 spin_unlock_bh(&ul->lock);
/* Unlink @rt from the per-cpu uncached list it was added to, if any.
 * Uses the list pointer cached in rt->rt6i_uncached_list (which may be a
 * different CPU's list than the current one), and decrements the
 * per-netns uncached-route counter under the list lock.
 */
142 static void rt6_uncached_list_del(struct rt6_info *rt)
144 if (!list_empty(&rt->rt6i_uncached)) {
145 struct uncached_list *ul = rt->rt6i_uncached_list;
146 struct net *net = dev_net(rt->dst.dev);
148 spin_lock_bh(&ul->lock);
149 list_del(&rt->rt6i_uncached);
150 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
151 spin_unlock_bh(&ul->lock);
/* On device teardown, walk every CPU's uncached list and re-point any
 * route bound to @dev at the loopback device (both its inet6_dev and its
 * dst.dev), taking/dropping the corresponding references, so the dying
 * device can be released.  Nothing to do when @dev is loopback itself.
 * NOTE(review): the extraction is missing lines here — in particular the
 * condition guarding the dst.dev reassignment (presumably rt_dev == dev)
 * and the reference drop on the old rt_dev; verify against full source.
 */
155 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
157 struct net_device *loopback_dev = net->loopback_dev;
160 if (dev == loopback_dev)
163 for_each_possible_cpu(cpu) {
164 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
167 spin_lock_bh(&ul->lock);
168 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
169 struct inet6_dev *rt_idev = rt->rt6i_idev;
170 struct net_device *rt_dev = rt->dst.dev;
172 if (rt_idev->dev == dev) {
173 rt->rt6i_idev = in6_dev_get(loopback_dev);
174 in6_dev_put(rt_idev);
178 rt->dst.dev = loopback_dev;
179 dev_hold(rt->dst.dev);
183 spin_unlock_bh(&ul->lock);
187 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
189 return dst_metrics_write_ptr(rt->dst.from);
/* dst_ops->cow_metrics hook: return a writable metrics array for @dst.
 * Per-cpu clones (RTF_PCPU) borrow the metrics of their parent route via
 * rt6_pcpu_cow_metrics(); other routes fall through to the generic
 * copy-on-write path.  NOTE(review): the RTF_CACHE branch body at
 * original line ~199-200 is missing from this extraction.
 */
192 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
194 struct rt6_info *rt = (struct rt6_info *)dst;
196 if (rt->rt6i_flags & RTF_PCPU)
197 return rt6_pcpu_cow_metrics(rt);
198 else if (rt->rt6i_flags & RTF_CACHE)
201 return dst_cow_metrics_generic(dst, old);
204 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
208 struct in6_addr *p = &rt->rt6i_gateway;
210 if (!ipv6_addr_any(p))
211 return (const void *) p;
213 return &ipv6_hdr(skb)->daddr;
217 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
221 struct rt6_info *rt = (struct rt6_info *) dst;
224 daddr = choose_neigh_daddr(rt, skb, daddr);
225 n = __ipv6_neigh_lookup(dst->dev, daddr);
228 return neigh_create(&nd_tbl, daddr, dst->dev);
231 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
233 struct net_device *dev = dst->dev;
234 struct rt6_info *rt = (struct rt6_info *)dst;
236 daddr = choose_neigh_daddr(rt, NULL, daddr);
239 if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
241 if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
243 __ipv6_confirm_neigh(dev, daddr);
246 static struct dst_ops ip6_dst_ops_template = {
250 .check = ip6_dst_check,
251 .default_advmss = ip6_default_advmss,
253 .cow_metrics = ipv6_cow_metrics,
254 .destroy = ip6_dst_destroy,
255 .ifdown = ip6_dst_ifdown,
256 .negative_advice = ip6_negative_advice,
257 .link_failure = ip6_link_failure,
258 .update_pmtu = ip6_rt_update_pmtu,
259 .redirect = rt6_do_redirect,
260 .local_out = __ip6_local_out,
261 .neigh_lookup = ip6_neigh_lookup,
262 .confirm_neigh = ip6_confirm_neigh,
265 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
267 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
269 return mtu ? : dst->dev->mtu;
272 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
273 struct sk_buff *skb, u32 mtu)
277 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
282 static struct dst_ops ip6_dst_blackhole_ops = {
284 .destroy = ip6_dst_destroy,
285 .check = ip6_dst_check,
286 .mtu = ip6_blackhole_mtu,
287 .default_advmss = ip6_default_advmss,
288 .update_pmtu = ip6_rt_blackhole_update_pmtu,
289 .redirect = ip6_rt_blackhole_redirect,
290 .cow_metrics = dst_cow_metrics_generic,
291 .neigh_lookup = ip6_neigh_lookup,
294 static const u32 ip6_template_metrics[RTAX_MAX] = {
295 [RTAX_HOPLIMIT - 1] = 0,
298 static const struct rt6_info ip6_null_entry_template = {
300 .__refcnt = ATOMIC_INIT(1),
302 .obsolete = DST_OBSOLETE_FORCE_CHK,
303 .error = -ENETUNREACH,
304 .input = ip6_pkt_discard,
305 .output = ip6_pkt_discard_out,
307 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
308 .rt6i_protocol = RTPROT_KERNEL,
309 .rt6i_metric = ~(u32) 0,
310 .rt6i_ref = ATOMIC_INIT(1),
313 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
315 static const struct rt6_info ip6_prohibit_entry_template = {
317 .__refcnt = ATOMIC_INIT(1),
319 .obsolete = DST_OBSOLETE_FORCE_CHK,
321 .input = ip6_pkt_prohibit,
322 .output = ip6_pkt_prohibit_out,
324 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
325 .rt6i_protocol = RTPROT_KERNEL,
326 .rt6i_metric = ~(u32) 0,
327 .rt6i_ref = ATOMIC_INIT(1),
330 static const struct rt6_info ip6_blk_hole_entry_template = {
332 .__refcnt = ATOMIC_INIT(1),
334 .obsolete = DST_OBSOLETE_FORCE_CHK,
336 .input = dst_discard,
337 .output = dst_discard_out,
339 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
340 .rt6i_protocol = RTPROT_KERNEL,
341 .rt6i_metric = ~(u32) 0,
342 .rt6i_ref = ATOMIC_INIT(1),
347 static void rt6_info_init(struct rt6_info *rt)
349 struct dst_entry *dst = &rt->dst;
351 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
352 INIT_LIST_HEAD(&rt->rt6i_siblings);
353 INIT_LIST_HEAD(&rt->rt6i_uncached);
356 /* allocate dst with ip6_dst_ops */
357 static struct rt6_info *__ip6_dst_alloc(struct net *net,
358 struct net_device *dev,
361 struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
362 1, DST_OBSOLETE_FORCE_CHK, flags);
366 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
372 struct rt6_info *ip6_dst_alloc(struct net *net,
373 struct net_device *dev,
376 struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
379 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
380 if (!rt->rt6i_pcpu) {
381 dst_release_immediate(&rt->dst);
388 EXPORT_SYMBOL(ip6_dst_alloc);
390 static void ip6_dst_destroy(struct dst_entry *dst)
392 struct rt6_info *rt = (struct rt6_info *)dst;
393 struct rt6_exception_bucket *bucket;
394 struct dst_entry *from = dst->from;
395 struct inet6_dev *idev;
397 dst_destroy_metrics_generic(dst);
398 free_percpu(rt->rt6i_pcpu);
399 rt6_uncached_list_del(rt);
401 idev = rt->rt6i_idev;
403 rt->rt6i_idev = NULL;
406 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
408 rt->rt6i_exception_bucket = NULL;
416 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
419 struct rt6_info *rt = (struct rt6_info *)dst;
420 struct inet6_dev *idev = rt->rt6i_idev;
421 struct net_device *loopback_dev =
422 dev_net(dev)->loopback_dev;
424 if (idev && idev->dev != loopback_dev) {
425 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
427 rt->rt6i_idev = loopback_idev;
433 static bool __rt6_check_expired(const struct rt6_info *rt)
435 if (rt->rt6i_flags & RTF_EXPIRES)
436 return time_after(jiffies, rt->dst.expires);
/* Return true if @rt should be considered expired: either its own
 * RTF_EXPIRES deadline has passed, or it is a clone (dst.from set) whose
 * parent is obsolete or itself expired (checked recursively).
 */
441 static bool rt6_check_expired(const struct rt6_info *rt)
443 if (rt->rt6i_flags & RTF_EXPIRES) {
444 if (time_after(jiffies, rt->dst.expires))
446 } else if (rt->dst.from) {
447 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
448 rt6_check_expired((struct rt6_info *)rt->dst.from);
453 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
454 struct flowi6 *fl6, int oif,
457 struct rt6_info *sibling, *next_sibling;
460 /* We might have already computed the hash for ICMPv6 errors. In such
461 * case it will always be non-zero. Otherwise now is the time to do it.
464 fl6->mp_hash = rt6_multipath_hash(fl6, NULL);
466 route_choosen = fl6->mp_hash % (match->rt6i_nsiblings + 1);
467 /* Don't change the route, if route_choosen == 0
468 * (siblings does not include ourself)
471 list_for_each_entry_safe(sibling, next_sibling,
472 &match->rt6i_siblings, rt6i_siblings) {
474 if (route_choosen == 0) {
475 if (rt6_score_route(sibling, oif, strict) < 0)
485 * Route lookup. rcu_read_lock() should be held.
488 static inline struct rt6_info *rt6_device_match(struct net *net,
490 const struct in6_addr *saddr,
494 struct rt6_info *local = NULL;
495 struct rt6_info *sprt;
497 if (!oif && ipv6_addr_any(saddr))
500 for (sprt = rt; sprt; sprt = rcu_dereference(sprt->dst.rt6_next)) {
501 struct net_device *dev = sprt->dst.dev;
504 if (dev->ifindex == oif)
506 if (dev->flags & IFF_LOOPBACK) {
507 if (!sprt->rt6i_idev ||
508 sprt->rt6i_idev->dev->ifindex != oif) {
509 if (flags & RT6_LOOKUP_F_IFACE)
512 local->rt6i_idev->dev->ifindex == oif)
518 if (ipv6_chk_addr(net, saddr, dev,
519 flags & RT6_LOOKUP_F_IFACE))
528 if (flags & RT6_LOOKUP_F_IFACE)
529 return net->ipv6.ip6_null_entry;
535 #ifdef CONFIG_IPV6_ROUTER_PREF
536 struct __rt6_probe_work {
537 struct work_struct work;
538 struct in6_addr target;
539 struct net_device *dev;
542 static void rt6_probe_deferred(struct work_struct *w)
544 struct in6_addr mcaddr;
545 struct __rt6_probe_work *work =
546 container_of(w, struct __rt6_probe_work, work);
548 addrconf_addr_solict_mult(&work->target, &mcaddr);
549 ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
554 static void rt6_probe(struct rt6_info *rt)
556 struct __rt6_probe_work *work;
557 struct neighbour *neigh;
559 * Okay, this does not seem to be appropriate
560 * for now, however, we need to check if it
561 * is really so; aka Router Reachability Probing.
563 * Router Reachability Probe MUST be rate-limited
564 * to no more than one per minute.
566 if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
569 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
571 if (neigh->nud_state & NUD_VALID)
575 write_lock(&neigh->lock);
576 if (!(neigh->nud_state & NUD_VALID) &&
579 rt->rt6i_idev->cnf.rtr_probe_interval)) {
580 work = kmalloc(sizeof(*work), GFP_ATOMIC);
582 __neigh_set_probe_once(neigh);
584 write_unlock(&neigh->lock);
586 work = kmalloc(sizeof(*work), GFP_ATOMIC);
590 INIT_WORK(&work->work, rt6_probe_deferred);
591 work->target = rt->rt6i_gateway;
592 dev_hold(rt->dst.dev);
593 work->dev = rt->dst.dev;
594 schedule_work(&work->work);
598 rcu_read_unlock_bh();
601 static inline void rt6_probe(struct rt6_info *rt)
607 * Default Router Selection (RFC 2461 6.3.6)
609 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
611 struct net_device *dev = rt->dst.dev;
612 if (!oif || dev->ifindex == oif)
614 if ((dev->flags & IFF_LOOPBACK) &&
615 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
/* Assess next-hop reachability for default-router selection.
 * Routes without a gateway trivially succeed.  Otherwise look up the
 * gateway's neighbour entry: NUD_VALID succeeds; with router-preference
 * support any non-FAILED state also succeeds but FAILED asks for a probe
 * (RT6_NUD_FAIL_PROBE).  No neighbour entry at all succeeds under
 * CONFIG_IPV6_ROUTER_PREF, else requests round-robin fallback.
 * Runs under rcu_read_lock_bh (the matching lock line is missing from
 * this extraction; the unlock is visible below).
 */
620 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
622 struct neighbour *neigh;
623 enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
625 if (rt->rt6i_flags & RTF_NONEXTHOP ||
626 !(rt->rt6i_flags & RTF_GATEWAY))
627 return RT6_NUD_SUCCEED;
630 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
632 read_lock(&neigh->lock);
633 if (neigh->nud_state & NUD_VALID)
634 ret = RT6_NUD_SUCCEED;
635 #ifdef CONFIG_IPV6_ROUTER_PREF
636 else if (!(neigh->nud_state & NUD_FAILED))
637 ret = RT6_NUD_SUCCEED;
639 ret = RT6_NUD_FAIL_PROBE;
641 read_unlock(&neigh->lock);
643 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
644 RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
646 rcu_read_unlock_bh();
651 static int rt6_score_route(struct rt6_info *rt, int oif,
656 m = rt6_check_dev(rt, oif);
657 if (!m && (strict & RT6_LOOKUP_F_IFACE))
658 return RT6_NUD_FAIL_HARD;
659 #ifdef CONFIG_IPV6_ROUTER_PREF
660 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
662 if (strict & RT6_LOOKUP_F_REACHABLE) {
663 int n = rt6_check_neigh(rt);
670 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
671 int *mpri, struct rt6_info *match,
675 bool match_do_rr = false;
676 struct inet6_dev *idev = rt->rt6i_idev;
677 struct net_device *dev = rt->dst.dev;
679 if (dev && !netif_carrier_ok(dev) &&
680 idev->cnf.ignore_routes_with_linkdown &&
681 !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
684 if (rt6_check_expired(rt))
687 m = rt6_score_route(rt, oif, strict);
688 if (m == RT6_NUD_FAIL_DO_RR) {
690 m = 0; /* lowest valid score */
691 } else if (m == RT6_NUD_FAIL_HARD) {
695 if (strict & RT6_LOOKUP_F_REACHABLE)
698 /* note that m can be RT6_NUD_FAIL_PROBE at this point */
700 *do_rr = match_do_rr;
708 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
709 struct rt6_info *leaf,
710 struct rt6_info *rr_head,
711 u32 metric, int oif, int strict,
714 struct rt6_info *rt, *match, *cont;
719 for (rt = rr_head; rt; rt = rcu_dereference(rt->dst.rt6_next)) {
720 if (rt->rt6i_metric != metric) {
725 match = find_match(rt, oif, strict, &mpri, match, do_rr);
728 for (rt = leaf; rt && rt != rr_head;
729 rt = rcu_dereference(rt->dst.rt6_next)) {
730 if (rt->rt6i_metric != metric) {
735 match = find_match(rt, oif, strict, &mpri, match, do_rr);
741 for (rt = cont; rt; rt = rcu_dereference(rt->dst.rt6_next))
742 match = find_match(rt, oif, strict, &mpri, match, do_rr);
/* Select the best route under fib6 node @fn for (oif, strict), with
 * round-robin rotation among equal-metric siblings.  Caller holds
 * rcu_read_lock().  Returns net->ipv6.ip6_null_entry when fn has no
 * usable leaf or when fn is an intermediate node whose leaf actually
 * belongs to a child (prefix-length sanity check below).
 */
747 static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
750 struct rt6_info *leaf = rcu_dereference(fn->leaf);
751 struct rt6_info *match, *rt0;
756 return net->ipv6.ip6_null_entry;
758 rt0 = rcu_dereference(fn->rr_ptr);
762 /* Double check to make sure fn is not an intermediate node
763 * and fn->leaf does not points to its child's leaf
764 * (This might happen if all routes under fn are deleted from
765 * the tree and fib6_repair_tree() is called on the node.)
767 key_plen = rt0->rt6i_dst.plen;
768 #ifdef CONFIG_IPV6_SUBTREES
769 if (rt0->rt6i_src.plen)
770 key_plen = rt0->rt6i_src.plen;
772 if (fn->fn_bit != key_plen)
773 return net->ipv6.ip6_null_entry;
775 match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
/* If find_rr_leaf asked for round-robin (do_rr), advance fn->rr_ptr to
 * the next same-metric sibling under the table lock so the next lookup
 * starts from a different candidate.
 */
779 struct rt6_info *next = rcu_dereference(rt0->dst.rt6_next);
781 /* no entries matched; do round-robin */
782 if (!next || next->rt6i_metric != rt0->rt6i_metric)
786 spin_lock_bh(&leaf->rt6i_table->tb6_lock);
787 /* make sure next is not being deleted from the tree */
789 rcu_assign_pointer(fn->rr_ptr, next);
790 spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
794 return match ? match : net->ipv6.ip6_null_entry;
797 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
799 return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
802 #ifdef CONFIG_IPV6_ROUTE_INFO
803 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
804 const struct in6_addr *gwaddr)
806 struct net *net = dev_net(dev);
807 struct route_info *rinfo = (struct route_info *) opt;
808 struct in6_addr prefix_buf, *prefix;
810 unsigned long lifetime;
813 if (len < sizeof(struct route_info)) {
817 /* Sanity check for prefix_len and length */
818 if (rinfo->length > 3) {
820 } else if (rinfo->prefix_len > 128) {
822 } else if (rinfo->prefix_len > 64) {
823 if (rinfo->length < 2) {
826 } else if (rinfo->prefix_len > 0) {
827 if (rinfo->length < 1) {
832 pref = rinfo->route_pref;
833 if (pref == ICMPV6_ROUTER_PREF_INVALID)
836 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
838 if (rinfo->length == 3)
839 prefix = (struct in6_addr *)rinfo->prefix;
841 /* this function is safe */
842 ipv6_addr_prefix(&prefix_buf,
843 (struct in6_addr *)rinfo->prefix,
845 prefix = &prefix_buf;
848 if (rinfo->prefix_len == 0)
849 rt = rt6_get_dflt_router(gwaddr, dev);
851 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
854 if (rt && !lifetime) {
860 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
863 rt->rt6i_flags = RTF_ROUTEINFO |
864 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
867 if (!addrconf_finite_timeout(lifetime))
868 rt6_clean_expires(rt);
870 rt6_set_expires(rt, jiffies + HZ * lifetime);
878 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
879 struct in6_addr *saddr)
881 struct fib6_node *pn, *sn;
883 if (fn->fn_flags & RTN_TL_ROOT)
885 pn = rcu_dereference(fn->parent);
886 sn = FIB6_SUBTREE(pn);
888 fn = fib6_lookup(sn, NULL, saddr);
891 if (fn->fn_flags & RTN_RTINFO)
896 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
899 struct rt6_info *rt = *prt;
901 if (dst_hold_safe(&rt->dst))
904 rt = net->ipv6.ip6_null_entry;
913 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
914 struct fib6_table *table,
915 struct flowi6 *fl6, int flags)
917 struct rt6_info *rt, *rt_cache;
918 struct fib6_node *fn;
921 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
923 rt = rcu_dereference(fn->leaf);
925 rt = net->ipv6.ip6_null_entry;
927 rt = rt6_device_match(net, rt, &fl6->saddr,
928 fl6->flowi6_oif, flags);
929 if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
930 rt = rt6_multipath_select(rt, fl6,
931 fl6->flowi6_oif, flags);
933 if (rt == net->ipv6.ip6_null_entry) {
934 fn = fib6_backtrack(fn, &fl6->saddr);
938 /* Search through exception table */
939 rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
943 if (ip6_hold_safe(net, &rt, true))
944 dst_use_noref(&rt->dst, jiffies);
948 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
954 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
957 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
959 EXPORT_SYMBOL_GPL(ip6_route_lookup);
961 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
962 const struct in6_addr *saddr, int oif, int strict)
964 struct flowi6 fl6 = {
968 struct dst_entry *dst;
969 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
972 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
973 flags |= RT6_LOOKUP_F_HAS_SADDR;
976 dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
978 return (struct rt6_info *) dst;
984 EXPORT_SYMBOL(rt6_lookup);
986 /* ip6_ins_rt is called with FREE table->tb6_lock.
987 * It takes new route entry, the addition fails by any reason the
989 * Caller must hold dst before calling it.
992 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
993 struct mx6_config *mxc,
994 struct netlink_ext_ack *extack)
997 struct fib6_table *table;
999 table = rt->rt6i_table;
1000 spin_lock_bh(&table->tb6_lock);
1001 err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
1002 spin_unlock_bh(&table->tb6_lock);
1007 int ip6_ins_rt(struct rt6_info *rt)
1009 struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
1010 struct mx6_config mxc = { .mx = NULL, };
1012 /* Hold dst to account for the reference from the fib6 tree */
1014 return __ip6_ins_rt(rt, &info, &mxc, NULL);
1017 /* called with rcu_lock held */
1018 static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
1020 struct net_device *dev = rt->dst.dev;
1022 if (rt->rt6i_flags & RTF_LOCAL) {
1023 /* for copies of local routes, dst->dev needs to be the
1024 * device if it is a master device, the master device if
1025 * device is enslaved, and the loopback as the default
1027 if (netif_is_l3_slave(dev) &&
1028 !rt6_need_strict(&rt->rt6i_dst.addr))
1029 dev = l3mdev_master_dev_rcu(dev);
1030 else if (!netif_is_l3_master(dev))
1031 dev = dev_net(dev)->loopback_dev;
1032 /* last case is netif_is_l3_master(dev) is true in which
1033 * case we want dev returned to be dev
/* Allocate an RTF_CACHE clone of @ort for destination @daddr (and, with
 * subtrees, source @saddr).  If @ort is itself a cache/pcpu clone, climb
 * to its parent first so the clone hangs off the tree-owned route.  The
 * clone is a /128 host route (DST_HOST, plen 128) with metric 0; for
 * non-gateway parents covering @daddr exactly it is flagged RTF_ANYCAST.
 * Caller context: runs under rcu_read_lock (device lookup via
 * ip6_rt_get_dev_rcu).  NOTE(review): null-check of the allocation and
 * the closing/return lines are missing from this extraction.
 */
1040 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
1041 const struct in6_addr *daddr,
1042 const struct in6_addr *saddr)
1044 struct net_device *dev;
1045 struct rt6_info *rt;
1051 if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
1052 ort = (struct rt6_info *)ort->dst.from;
1055 dev = ip6_rt_get_dev_rcu(ort);
1056 rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
1061 ip6_rt_copy_init(rt, ort);
1062 rt->rt6i_flags |= RTF_CACHE;
1063 rt->rt6i_metric = 0;
1064 rt->dst.flags |= DST_HOST;
1065 rt->rt6i_dst.addr = *daddr;
1066 rt->rt6i_dst.plen = 128;
1068 if (!rt6_is_gw_or_nonexthop(ort)) {
1069 if (ort->rt6i_dst.plen != 128 &&
1070 ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
1071 rt->rt6i_flags |= RTF_ANYCAST;
1072 #ifdef CONFIG_IPV6_SUBTREES
1073 if (rt->rt6i_src.plen && saddr) {
1074 rt->rt6i_src.addr = *saddr;
1075 rt->rt6i_src.plen = 128;
1083 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
1085 struct net_device *dev;
1086 struct rt6_info *pcpu_rt;
1089 dev = ip6_rt_get_dev_rcu(rt);
1090 pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
1094 ip6_rt_copy_init(pcpu_rt, rt);
1095 pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1096 pcpu_rt->rt6i_flags |= RTF_PCPU;
1100 /* It should be called with rcu_read_lock() acquired */
1101 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1103 struct rt6_info *pcpu_rt, **p;
1105 p = this_cpu_ptr(rt->rt6i_pcpu);
1108 if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
1109 rt6_dst_from_metrics_check(pcpu_rt);
1114 static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
1116 struct rt6_info *pcpu_rt, *prev, **p;
1118 pcpu_rt = ip6_rt_pcpu_alloc(rt);
1120 struct net *net = dev_net(rt->dst.dev);
1122 dst_hold(&net->ipv6.ip6_null_entry->dst);
1123 return net->ipv6.ip6_null_entry;
1126 dst_hold(&pcpu_rt->dst);
1127 p = this_cpu_ptr(rt->rt6i_pcpu);
1128 prev = cmpxchg(p, NULL, pcpu_rt);
1131 rt6_dst_from_metrics_check(pcpu_rt);
1135 /* exception hash table implementation
1137 static DEFINE_SPINLOCK(rt6_exception_lock);
1139 /* Remove rt6_ex from hash table and free the memory
1140 * Caller must hold rt6_exception_lock
/* Detaches the exception's cached route from its fib6 node, unlinks the
 * hash entry (RCU-safe), drops the route reference, frees the entry
 * after a grace period, and decrements bucket depth + cache stats.
 */
1142 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1143 struct rt6_exception *rt6_ex)
1145 struct net *net = dev_net(rt6_ex->rt6i->dst.dev);
1147 if (!bucket || !rt6_ex)
1149 rt6_ex->rt6i->rt6i_node = NULL;
1150 hlist_del_rcu(&rt6_ex->hlist);
1151 rt6_release(rt6_ex->rt6i);
1152 kfree_rcu(rt6_ex, rcu);
1153 WARN_ON_ONCE(!bucket->depth);
1155 net->ipv6.rt6_stats->fib_rt_cache--;
1158 /* Remove oldest rt6_ex in bucket and free the memory
1159 * Caller must hold rt6_exception_lock
1161 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1163 struct rt6_exception *rt6_ex, *oldest = NULL;
1168 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1169 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1172 rt6_remove_exception(bucket, oldest);
/* Hash (dst, src) into a bucket index for the per-route exception table.
 * Seeded once with a random value; src is only folded in when subtrees
 * are enabled (and, per the #ifdef, presumably only when src != NULL —
 * the guard line is missing from this extraction).  Result is truncated
 * to FIB6_EXCEPTION_BUCKET_SIZE_SHIFT bits via hash_32().
 */
1175 static u32 rt6_exception_hash(const struct in6_addr *dst,
1176 const struct in6_addr *src)
1178 static u32 seed __read_mostly;
1181 net_get_random_once(&seed, sizeof(seed));
1182 val = jhash(dst, sizeof(*dst), seed);
1184 #ifdef CONFIG_IPV6_SUBTREES
1186 val = jhash(src, sizeof(*src), val);
1188 return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1191 /* Helper function to find the cached rt in the hash table
1192 * and update bucket pointer to point to the bucket for this
1193 * (daddr, saddr) pair
1194 * Caller must hold rt6_exception_lock
1196 static struct rt6_exception *
1197 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1198 const struct in6_addr *daddr,
1199 const struct in6_addr *saddr)
1201 struct rt6_exception *rt6_ex;
1204 if (!(*bucket) || !daddr)
1207 hval = rt6_exception_hash(daddr, saddr);
1210 hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1211 struct rt6_info *rt6 = rt6_ex->rt6i;
1212 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1214 #ifdef CONFIG_IPV6_SUBTREES
1215 if (matched && saddr)
1216 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1224 /* Helper function to find the cached rt in the hash table
1225 * and update bucket pointer to point to the bucket for this
1226 * (daddr, saddr) pair
1227 * Caller must hold rcu_read_lock()
1229 static struct rt6_exception *
1230 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1231 const struct in6_addr *daddr,
1232 const struct in6_addr *saddr)
1234 struct rt6_exception *rt6_ex;
1237 WARN_ON_ONCE(!rcu_read_lock_held());
1239 if (!(*bucket) || !daddr)
1242 hval = rt6_exception_hash(daddr, saddr);
1245 hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1246 struct rt6_info *rt6 = rt6_ex->rt6i;
1247 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1249 #ifdef CONFIG_IPV6_SUBTREES
1250 if (matched && saddr)
1251 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
/* Insert cache route @nrt as an exception hanging off tree route @ort.
 * Takes rt6_exception_lock; allocates the bucket array on first use;
 * refuses insertion if @ort's bucket was already flushed (route being
 * removed) or if @nrt's mtu is not actually smaller than @ort's.
 * Replaces any existing exception for the same (daddr[,saddr]) key,
 * evicts the oldest entry when the bucket exceeds FIB6_MAX_DEPTH, and
 * bumps fn_sernum so stale cached dsts get revalidated.
 * NOTE(review): error-path lines (goto/err assignments) are missing from
 * this extraction; the visible flow is the success path.
 */
1259 static int rt6_insert_exception(struct rt6_info *nrt,
1260 struct rt6_info *ort)
1262 struct net *net = dev_net(ort->dst.dev);
1263 struct rt6_exception_bucket *bucket;
1264 struct in6_addr *src_key = NULL;
1265 struct rt6_exception *rt6_ex;
1268 /* ort can't be a cache or pcpu route */
1269 if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
1270 ort = (struct rt6_info *)ort->dst.from;
1271 WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));
1273 spin_lock_bh(&rt6_exception_lock);
1275 if (ort->exception_bucket_flushed) {
1280 bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1281 lockdep_is_held(&rt6_exception_lock));
1283 bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1289 rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1292 #ifdef CONFIG_IPV6_SUBTREES
1293 /* rt6i_src.plen != 0 indicates ort is in subtree
1294 * and exception table is indexed by a hash of
1295 * both rt6i_dst and rt6i_src.
1296 * Otherwise, the exception table is indexed by
1297 * a hash of only rt6i_dst.
1299 if (ort->rt6i_src.plen)
1300 src_key = &nrt->rt6i_src.addr;
1303 /* Update rt6i_prefsrc as it could be changed
1304 * in rt6_remove_prefsrc()
1306 nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
1307 /* rt6_mtu_change() might lower mtu on ort.
1308 * Only insert this exception route if its mtu
1309 * is less than ort's mtu value.
1311 if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
1316 rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1319 rt6_remove_exception(bucket, rt6_ex);
1321 rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1327 rt6_ex->stamp = jiffies;
1328 atomic_inc(&nrt->rt6i_ref);
1329 nrt->rt6i_node = ort->rt6i_node;
1330 hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1332 net->ipv6.rt6_stats->fib_rt_cache++;
1334 if (bucket->depth > FIB6_MAX_DEPTH)
1335 rt6_exception_remove_oldest(bucket);
1338 spin_unlock_bh(&rt6_exception_lock);
1340 /* Update fn->fn_sernum to invalidate all cached dst */
1342 fib6_update_sernum(ort);
1347 void rt6_flush_exceptions(struct rt6_info *rt)
1349 struct rt6_exception_bucket *bucket;
1350 struct rt6_exception *rt6_ex;
1351 struct hlist_node *tmp;
1354 spin_lock_bh(&rt6_exception_lock);
1355 /* Prevent rt6_insert_exception() to recreate the bucket list */
1356 rt->exception_bucket_flushed = 1;
1358 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1359 lockdep_is_held(&rt6_exception_lock));
1363 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1364 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1365 rt6_remove_exception(bucket, rt6_ex);
1366 WARN_ON_ONCE(bucket->depth);
1371 spin_unlock_bh(&rt6_exception_lock);
1374 /* Find cached rt in the hash table inside passed in rt
1375 * Caller has to hold rcu_read_lock()
/* Returns the matching, non-expired RTF_CACHE clone for (daddr, saddr)
 * from @rt's exception table, or NULL.  With subtrees enabled, saddr is
 * part of the key only when @rt lives in a subtree (rt6i_src.plen != 0);
 * the src_key assignment line is missing from this extraction but the
 * plen guard is visible.
 */
1377 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
1378 struct in6_addr *daddr,
1379 struct in6_addr *saddr)
1381 struct rt6_exception_bucket *bucket;
1382 struct in6_addr *src_key = NULL;
1383 struct rt6_exception *rt6_ex;
1384 struct rt6_info *res = NULL;
1386 bucket = rcu_dereference(rt->rt6i_exception_bucket);
1388 #ifdef CONFIG_IPV6_SUBTREES
1389 /* rt6i_src.plen != 0 indicates rt is in subtree
1390 * and exception table is indexed by a hash of
1391 * both rt6i_dst and rt6i_src.
1392 * Otherwise, the exception table is indexed by
1393 * a hash of only rt6i_dst.
1395 if (rt->rt6i_src.plen)
1398 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key)
1400 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1406 /* Remove the passed in cached rt from the hash table that contains it */
1407 int rt6_remove_exception_rt(struct rt6_info *rt)
1409 struct rt6_info *from = (struct rt6_info *)rt->dst.from;
1410 struct rt6_exception_bucket *bucket;
1411 struct in6_addr *src_key = NULL;
1412 struct rt6_exception *rt6_ex;
1416 !(rt->rt6i_flags | RTF_CACHE))
1419 if (!rcu_access_pointer(from->rt6i_exception_bucket))
1422 spin_lock_bh(&rt6_exception_lock);
1423 bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1424 lockdep_is_held(&rt6_exception_lock));
1425 #ifdef CONFIG_IPV6_SUBTREES
1426 /* rt6i_src.plen != 0 indicates 'from' is in subtree
1427 * and exception table is indexed by a hash of
1428 * both rt6i_dst and rt6i_src.
1429 * Otherwise, the exception table is indexed by
1430 * a hash of only rt6i_dst.
1432 if (from->rt6i_src.plen)
1433 src_key = &rt->rt6i_src.addr;
1435 rt6_ex = __rt6_find_exception_spinlock(&bucket,
1439 rt6_remove_exception(bucket, rt6_ex);
1445 spin_unlock_bh(&rt6_exception_lock);
1449 /* Find rt6_ex which contains the passed in rt cache and
1452 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1454 struct rt6_info *from = (struct rt6_info *)rt->dst.from;
1455 struct rt6_exception_bucket *bucket;
1456 struct in6_addr *src_key = NULL;
1457 struct rt6_exception *rt6_ex;
1460 !(rt->rt6i_flags | RTF_CACHE))
1464 bucket = rcu_dereference(from->rt6i_exception_bucket);
1466 #ifdef CONFIG_IPV6_SUBTREES
1467 /* rt6i_src.plen != 0 indicates 'from' is in subtree
1468 * and exception table is indexed by a hash of
1469 * both rt6i_dst and rt6i_src.
1470 * Otherwise, the exception table is indexed by
1471 * a hash of only rt6i_dst.
1473 if (from->rt6i_src.plen)
1474 src_key = &rt->rt6i_src.addr;
1476 rt6_ex = __rt6_find_exception_rcu(&bucket,
1480 rt6_ex->stamp = jiffies;
/* Clears the preferred-source address (plen = 0) on every cached clone in
 * rt's exception table.  Caller holds rt6_exception_lock (see the
 * rcu_dereference_protected() lockdep annotation below).
 */
1485 static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
1487 struct rt6_exception_bucket *bucket;
1488 struct rt6_exception *rt6_ex;
1491 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1492 lockdep_is_held(&rt6_exception_lock));
1495 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1496 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1497 rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
/* Lowers the stored PMTU of every cached clone in rt's exception table to
 * at most 'mtu'.  Entries with rt6i_pmtu == 0 (redirect clones) are left
 * alone -- they inherit metrics from rt->dst.from.  Caller holds
 * rt6_exception_lock.
 */
1504 static void rt6_exceptions_update_pmtu(struct rt6_info *rt, int mtu)
1506 struct rt6_exception_bucket *bucket;
1507 struct rt6_exception *rt6_ex;
1510 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1511 lockdep_is_held(&rt6_exception_lock));
1514 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1515 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1516 struct rt6_info *entry = rt6_ex->rt6i;
1517 /* For RTF_CACHE with rt6i_pmtu == 0
1518 * (i.e. a redirected route),
1519 * the metrics of its rt->dst.from has already
/* Only shrink an explicitly-set PMTU; never raise it. */
1522 if (entry->rt6i_pmtu && entry->rt6i_pmtu > mtu)
1523 entry->rt6i_pmtu = mtu;
/* Both bits must be set for an entry to count as a cached gateway route. */
1530 #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE)
/* Drops every cached gateway clone in rt's exception table whose gateway
 * equals 'gateway' (used when a router becomes a plain host).
 */
1532 static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
1533 struct in6_addr *gateway)
1535 struct rt6_exception_bucket *bucket;
1536 struct rt6_exception *rt6_ex;
1537 struct hlist_node *tmp;
/* Cheap RCU peek before taking the lock: nothing to do if no table. */
1540 if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1543 spin_lock_bh(&rt6_exception_lock);
1544 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1545 lockdep_is_held(&rt6_exception_lock));
1548 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1549 hlist_for_each_entry_safe(rt6_ex, tmp,
1550 &bucket->chain, hlist) {
1551 struct rt6_info *entry = rt6_ex->rt6i;
1553 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1554 RTF_CACHE_GATEWAY &&
1555 ipv6_addr_equal(gateway,
1556 &entry->rt6i_gateway)) {
1557 rt6_remove_exception(bucket, rt6_ex);
1564 spin_unlock_bh(&rt6_exception_lock);
/* GC helper: removes one exception entry if it is unreferenced and past
 * its timeout, or if it is a gateway clone whose neighbour is no longer
 * flagged as a router.  Caller holds rt6_exception_lock.
 */
1567 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1568 struct rt6_exception *rt6_ex,
1569 struct fib6_gc_args *gc_args,
1572 struct rt6_info *rt = rt6_ex->rt6i;
/* __refcnt == 1 means only the exception table holds a reference. */
1574 if (atomic_read(&rt->dst.__refcnt) == 1 &&
1575 time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1576 RT6_TRACE("aging clone %p\n", rt);
1577 rt6_remove_exception(bucket, rt6_ex);
1579 } else if (rt->rt6i_flags & RTF_GATEWAY) {
1580 struct neighbour *neigh;
1581 __u8 neigh_flags = 0;
1583 neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway);
1585 neigh_flags = neigh->flags;
1586 neigh_release(neigh);
1588 if (!(neigh_flags & NTF_ROUTER)) {
1589 RT6_TRACE("purging route %p via non-router but gateway\n",
1591 rt6_remove_exception(bucket, rt6_ex);
/* Walks every bucket of rt's exception table and ages each entry via
 * rt6_age_examine_exception() under rt6_exception_lock.
 */
1598 void rt6_age_exceptions(struct rt6_info *rt,
1599 struct fib6_gc_args *gc_args,
1602 struct rt6_exception_bucket *bucket;
1603 struct rt6_exception *rt6_ex;
1604 struct hlist_node *tmp;
/* Lock-free early out when the route has no exception table. */
1607 if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1610 spin_lock_bh(&rt6_exception_lock);
1611 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1612 lockdep_is_held(&rt6_exception_lock));
1615 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1616 hlist_for_each_entry_safe(rt6_ex, tmp,
1617 &bucket->chain, hlist) {
1618 rt6_age_examine_exception(bucket, rt6_ex,
1624 spin_unlock_bh(&rt6_exception_lock);
/* Core policy-routing lookup for one fib6 table.  Selects a route (with
 * multipath and reachability handling), then returns, in priority order:
 * a matching exception-table clone, the RTF_CACHE route itself, a
 * one-off uncached clone (FLOWI_FLAG_KNOWN_NH without a gateway), or a
 * per-cpu copy of the fib6 route.  Runs under RCU; many intermediate
 * lines are elided in this listing.
 */
1627 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1628 int oif, struct flowi6 *fl6, int flags)
1630 struct fib6_node *fn, *saved_fn;
1631 struct rt6_info *rt, *rt_cache;
1634 strict |= flags & RT6_LOOKUP_F_IFACE;
1635 strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
/* With forwarding disabled, prefer (probably) reachable routers. */
1636 if (net->ipv6.devconf_all->forwarding == 0)
1637 strict |= RT6_LOOKUP_F_REACHABLE;
1641 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1644 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1648 rt = rt6_select(net, fn, oif, strict);
1649 if (rt->rt6i_nsiblings)
1650 rt = rt6_multipath_select(rt, fl6, oif, strict);
/* Nothing matched here: backtrack up the tree, then retry once more
 * without the REACHABLE restriction before giving up. */
1651 if (rt == net->ipv6.ip6_null_entry) {
1652 fn = fib6_backtrack(fn, &fl6->saddr);
1654 goto redo_rt6_select;
1655 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1656 /* also consider unreachable route */
1657 strict &= ~RT6_LOOKUP_F_REACHABLE;
1659 goto redo_rt6_select;
1663 /*Search through exception table */
1664 rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
1668 if (rt == net->ipv6.ip6_null_entry) {
1671 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1673 } else if (rt->rt6i_flags & RTF_CACHE) {
1674 if (ip6_hold_safe(net, &rt, true)) {
1675 dst_use_noref(&rt->dst, jiffies);
1676 rt6_dst_from_metrics_check(rt);
1679 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1681 } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1682 !(rt->rt6i_flags & RTF_GATEWAY))) {
1683 /* Create a RTF_CACHE clone which will not be
1684 * owned by the fib6 tree. It is for the special case where
1685 * the daddr in the skb during the neighbor look-up is different
1686 * from the fl6->daddr used to look-up route here.
1689 struct rt6_info *uncached_rt;
1691 if (ip6_hold_safe(net, &rt, true)) {
1692 dst_use_noref(&rt->dst, jiffies);
1696 goto uncached_rt_out;
1700 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1701 dst_release(&rt->dst);
1704 /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1705 * No need for another dst_hold()
1707 rt6_uncached_list_add(uncached_rt);
1708 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1710 uncached_rt = net->ipv6.ip6_null_entry;
1711 dst_hold(&uncached_rt->dst);
1715 trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
1719 /* Get a percpu copy */
1721 struct rt6_info *pcpu_rt;
1723 dst_use_noref(&rt->dst, jiffies);
1725 pcpu_rt = rt6_get_pcpu_route(rt);
1728 /* atomic_inc_not_zero() is needed when using rcu */
1729 if (atomic_inc_not_zero(&rt->rt6i_ref)) {
1730 /* No dst_hold() on rt is needed because grabbing
1731 * rt->rt6i_ref makes sure rt can't be released.
1733 pcpu_rt = rt6_make_pcpu_route(rt);
1736 /* rt is already removed from tree */
1737 pcpu_rt = net->ipv6.ip6_null_entry;
1738 dst_hold(&pcpu_rt->dst);
1743 trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
1747 EXPORT_SYMBOL_GPL(ip6_pol_route);
/* Input-path wrapper: routes on the incoming interface (flowi6_iif). */
1749 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1750 struct flowi6 *fl6, int flags)
1752 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
/* Public input-route lookup: forces strict-interface matching for
 * link-local/scoped destinations (except on PIM register devices), then
 * dispatches through the policy-rule engine.
 */
1755 struct dst_entry *ip6_route_input_lookup(struct net *net,
1756 struct net_device *dev,
1757 struct flowi6 *fl6, int flags)
1759 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1760 flags |= RT6_LOOKUP_F_IFACE;
1762 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1764 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
/* Fills 'keys' for multipath hashing.  For ICMPv6 error messages the keys
 * are taken from the embedded (inner) offending header so that errors hash
 * onto the same path as the flow they refer to; otherwise the outer header
 * is used.
 */
1766 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1767 struct flow_keys *keys)
1769 const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1770 const struct ipv6hdr *key_iph = outer_iph;
1771 const struct ipv6hdr *inner_iph;
1772 const struct icmp6hdr *icmph;
1773 struct ipv6hdr _inner_iph;
1775 if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1778 icmph = icmp6_hdr(skb);
/* Only error-class ICMPv6 types carry an embedded original header. */
1779 if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1780 icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1781 icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1782 icmph->icmp6_type != ICMPV6_PARAMPROB)
1785 inner_iph = skb_header_pointer(skb,
1786 skb_transport_offset(skb) + sizeof(*icmph),
1787 sizeof(_inner_iph), &_inner_iph);
1791 key_iph = inner_iph;
1793 memset(keys, 0, sizeof(*keys));
1794 keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1795 keys->addrs.v6addrs.src = key_iph->saddr;
1796 keys->addrs.v6addrs.dst = key_iph->daddr;
1797 keys->tags.flow_label = ip6_flowinfo(key_iph);
1798 keys->basic.ip_proto = key_iph->nexthdr;
1801 /* if skb is set it will be used and fl6 can be NULL */
/* Returns the multipath hash: L3-keys hash when an skb is available,
 * otherwise the generic flowi6 hash.
 */
1802 u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
1804 struct flow_keys hash_keys;
1807 ip6_multipath_l3_keys(skb, &hash_keys);
1808 return flow_hash_from_keys(&hash_keys);
1811 return get_hash_from_flowi6(fl6);
/* Receive-path entry point: builds a flowi6 from the packet headers
 * (carrying tunnel id and, for ICMPv6, a multipath hash), performs the
 * input route lookup, and attaches the resulting dst to the skb.
 */
1814 void ip6_route_input(struct sk_buff *skb)
1816 const struct ipv6hdr *iph = ipv6_hdr(skb);
1817 struct net *net = dev_net(skb->dev);
1818 int flags = RT6_LOOKUP_F_HAS_SADDR;
1819 struct ip_tunnel_info *tun_info;
1820 struct flowi6 fl6 = {
1821 .flowi6_iif = skb->dev->ifindex,
1822 .daddr = iph->daddr,
1823 .saddr = iph->saddr,
1824 .flowlabel = ip6_flowinfo(iph),
1825 .flowi6_mark = skb->mark,
1826 .flowi6_proto = iph->nexthdr,
/* Only RX-side tunnel metadata contributes a tunnel key here. */
1829 tun_info = skb_tunnel_info(skb);
1830 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1831 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1832 if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
1833 fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
1835 skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
/* Output-path wrapper: routes on the outgoing interface (flowi6_oif). */
1838 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1839 struct flowi6 *fl6, int flags)
1841 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
/* Public output-route lookup.  Scoped destinations may short-circuit to an
 * L3 master-device lookup; otherwise lookup flags are derived from the
 * socket binding, source address presence, and source-address preferences
 * before dispatching through the policy-rule engine.
 */
1844 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1845 struct flowi6 *fl6, int flags)
1849 if (rt6_need_strict(&fl6->daddr)) {
1850 struct dst_entry *dst;
1852 dst = l3mdev_link_scope_lookup(net, fl6);
1857 fl6->flowi6_iif = LOOPBACK_IFINDEX;
1859 any_src = ipv6_addr_any(&fl6->saddr);
1860 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1861 (fl6->flowi6_oif && any_src))
1862 flags |= RT6_LOOKUP_F_IFACE;
1865 flags |= RT6_LOOKUP_F_HAS_SADDR;
1867 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1869 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1871 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
/* Converts dst_orig into a blackhole dst: a loopback-backed copy that
 * discards all traffic (input/output = dst_discard*) while preserving the
 * original's metrics and routing keys.  Releases dst_orig; returns the new
 * dst or ERR_PTR(-ENOMEM) when allocation failed.
 */
1873 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1875 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1876 struct net_device *loopback_dev = net->loopback_dev;
1877 struct dst_entry *new = NULL;
1879 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
1880 DST_OBSOLETE_NONE, 0);
1883 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
1887 new->input = dst_discard;
1888 new->output = dst_discard_out;
1890 dst_copy_metrics(new, &ort->dst);
1892 rt->rt6i_idev = in6_dev_get(loopback_dev);
1893 rt->rt6i_gateway = ort->rt6i_gateway;
/* A blackhole copy must not inherit the per-cpu flag. */
1894 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
1895 rt->rt6i_metric = 0;
1897 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1898 #ifdef CONFIG_IPV6_SUBTREES
1899 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1903 dst_release(dst_orig);
1904 return new ? new : ERR_PTR(-ENOMEM);
1908 * Destination cache support functions
/* Re-points rt's metrics at its parent's (dst.from) metrics when they have
 * diverged, keeping clones in sync with the fib6 route they came from.
 */
1911 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1914 dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1915 dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
/* Validates a cached dst against its fib6 cookie and expiry; the elided
 * tail returns the dst when still valid, NULL otherwise.
 */
1918 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1922 if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
1925 if (rt6_check_expired(rt))
/* Validates a clone by checking that it has not expired itself and that
 * its parent (dst.from) still passes rt6_check() for the cookie.
 */
1931 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1933 if (!__rt6_check_expired(rt) &&
1934 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1935 rt6_check((struct rt6_info *)(rt->dst.from), cookie))
/* dst_ops->check hook: refreshes inherited metrics, then validates either
 * through the parent (per-cpu or uncached clones) or directly.
 */
1941 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1943 struct rt6_info *rt;
1945 rt = (struct rt6_info *) dst;
1947 /* All IPV6 dsts are created with ->obsolete set to the value
1948 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1949 * into this function always.
1952 rt6_dst_from_metrics_check(rt);
1954 if (rt->rt6i_flags & RTF_PCPU ||
1955 (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from))
1956 return rt6_dst_from_check(rt, cookie);
1958 return rt6_check(rt, cookie);
/* dst_ops->negative_advice hook: for expired RTF_CACHE routes the (elided)
 * body drops the cached entry so a fresh lookup happens next time.
 */
1961 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1963 struct rt6_info *rt = (struct rt6_info *) dst;
1966 if (rt->rt6i_flags & RTF_CACHE) {
1967 if (rt6_check_expired(rt)) {
/* dst_ops->link_failure hook: reports address-unreachable to the sender,
 * then invalidates the failing route (cached entries get dropped; default
 * routes have their fib6 node handling in the elided tail).
 */
1979 static void ip6_link_failure(struct sk_buff *skb)
1981 struct rt6_info *rt;
1983 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1985 rt = (struct rt6_info *) skb_dst(skb);
1987 if (rt->rt6i_flags & RTF_CACHE) {
1988 if (dst_hold_safe(&rt->dst))
1991 struct fib6_node *fn;
1994 fn = rcu_dereference(rt->rt6i_node);
1995 if (fn && (rt->rt6i_flags & RTF_DEFAULT))
/* Records a new path MTU on rt and (re)arms its expiry with the
 * sysctl-configured PMTU lifetime.
 */
2002 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2004 struct net *net = dev_net(rt->dst.dev);
2006 rt->rt6i_flags |= RTF_MODIFIED;
2007 rt->rt6i_pmtu = mtu;
2008 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
/* True when a PMTU update should be stored in a new cached clone rather
 * than on rt itself: rt is not already a cache entry and is either a
 * per-cpu copy or still linked into the fib6 tree.
 */
2011 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2013 return !(rt->rt6i_flags & RTF_CACHE) &&
2014 (rt->rt6i_flags & RTF_PCPU ||
2015 rcu_access_pointer(rt->rt6i_node));
/* Applies a PMTU update to dst.  Addresses come from the offending header
 * when available, otherwise from the socket.  The MTU is clamped to at
 * least IPV6_MIN_MTU and ignored unless it shrinks the path; updates go
 * either directly onto rt6 or into a freshly inserted exception clone.
 */
2018 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2019 const struct ipv6hdr *iph, u32 mtu)
2021 const struct in6_addr *daddr, *saddr;
2022 struct rt6_info *rt6 = (struct rt6_info *)dst;
2024 if (rt6->rt6i_flags & RTF_LOCAL)
/* Locked MTU metric means administrators pinned it; never override. */
2027 if (dst_metric_locked(dst, RTAX_MTU))
2031 daddr = &iph->daddr;
2032 saddr = &iph->saddr;
2034 daddr = &sk->sk_v6_daddr;
2035 saddr = &inet6_sk(sk)->saddr;
2040 dst_confirm_neigh(dst, daddr);
2041 mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2042 if (mtu >= dst_mtu(dst))
2045 if (!rt6_cache_allowed_for_pmtu(rt6)) {
2046 rt6_do_update_pmtu(rt6, mtu);
2047 /* update rt6_ex->stamp for cache */
2048 if (rt6->rt6i_flags & RTF_CACHE)
2049 rt6_update_exception_stamp_rt(rt6);
2051 struct rt6_info *nrt6;
2053 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
2055 rt6_do_update_pmtu(nrt6, mtu);
/* Insert failure: drop the clone we just allocated. */
2056 if (rt6_insert_exception(nrt6, rt6))
2057 dst_release_immediate(&nrt6->dst);
/* dst_ops->update_pmtu hook: forwards to __ip6_rt_update_pmtu(), passing
 * the packet's IPv6 header when an skb is available.
 */
2062 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2063 struct sk_buff *skb, u32 mtu)
2065 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
/* Public helper: rebuilds the flow from the offending packet embedded in
 * an ICMPv6 Packet-Too-Big, looks up the output route, and applies the
 * (network-order) MTU to it.
 */
2068 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2069 int oif, u32 mark, kuid_t uid)
2071 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2072 struct dst_entry *dst;
2075 memset(&fl6, 0, sizeof(fl6));
2076 fl6.flowi6_oif = oif;
2077 fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2078 fl6.daddr = iph->daddr;
2079 fl6.saddr = iph->saddr;
2080 fl6.flowlabel = ip6_flowinfo(iph);
2081 fl6.flowi6_uid = uid;
2083 dst = ip6_route_output(net, NULL, &fl6);
2085 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2088 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
/* Socket variant of ip6_update_pmtu(): applies the update using the
 * socket's binding/mark/uid, then refreshes the socket's cached dst if it
 * has become stale (skipped for v4-mapped peers and when the socket is
 * owned by user context).
 */
2090 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2092 struct dst_entry *dst;
2094 ip6_update_pmtu(skb, sock_net(sk), mtu,
2095 sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
2097 dst = __sk_dst_get(sk);
2098 if (!dst || !dst->obsolete ||
2099 dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2103 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2104 ip6_datagram_dst_update(sk, false);
2107 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2109 /* Handle redirects */
/* flowi6 extended with the redirecting router's address, so the redirect
 * lookup can verify the sender is the current next hop.
 */
2110 struct ip6rd_flowi {
2112 struct in6_addr gateway;
/* Lookup used when processing an ICMPv6 Redirect: finds the route whose
 * current gateway matches the redirect's source router (also searching
 * exception clones, whose gateway may differ from the parent's), skipping
 * expired/non-gateway/wrong-interface candidates.  Several lines are
 * elided in this listing.
 */
2115 static struct rt6_info *__ip6_route_redirect(struct net *net,
2116 struct fib6_table *table,
2120 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2121 struct rt6_info *rt, *rt_cache;
2122 struct fib6_node *fn;
2124 /* Get the "current" route for this destination and
2125 * check if the redirect has come from appropriate router.
2127 * RFC 4861 specifies that redirects should only be
2128 * accepted if they come from the nexthop to the target.
2129 * Due to the way the routes are chosen, this notion
2130 * is a bit fuzzy and one might need to check all possible
2135 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2137 for_each_fib6_node_rt_rcu(fn) {
2138 if (rt6_check_expired(rt))
2142 if (!(rt->rt6i_flags & RTF_GATEWAY))
2144 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
2146 /* rt_cache's gateway might be different from its 'parent'
2147 * in the case of an ip redirect.
2148 * So we keep searching in the exception table if the gateway
2151 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
2152 rt_cache = rt6_find_cached_rt(rt,
2156 ipv6_addr_equal(&rdfl->gateway,
2157 &rt_cache->rt6i_gateway)) {
2167 rt = net->ipv6.ip6_null_entry;
2168 else if (rt->dst.error) {
2169 rt = net->ipv6.ip6_null_entry;
/* No acceptable route at this node: backtrack and retry. */
2173 if (rt == net->ipv6.ip6_null_entry) {
2174 fn = fib6_backtrack(fn, &fl6->saddr);
2180 ip6_hold_safe(net, &rt, true);
2184 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
/* Wraps the flow and redirecting gateway into an ip6rd_flowi and runs the
 * redirect lookup through the policy-rule engine.
 */
2188 static struct dst_entry *ip6_route_redirect(struct net *net,
2189 const struct flowi6 *fl6,
2190 const struct in6_addr *gateway)
2192 int flags = RT6_LOOKUP_F_HAS_SADDR;
2193 struct ip6rd_flowi rdfl;
2196 rdfl.gateway = *gateway;
2198 return fib6_rule_lookup(net, &rdfl.fl6,
2199 flags, __ip6_route_redirect);
/* Applies an ICMPv6 Redirect carried in skb: rebuilds the flow from the
 * embedded offending header, finds the affected route, and lets
 * rt6_do_redirect() update it.  The redirecting router is the packet's
 * IPv6 source address.
 */
2202 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2205 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2206 struct dst_entry *dst;
2209 memset(&fl6, 0, sizeof(fl6));
2210 fl6.flowi6_iif = LOOPBACK_IFINDEX;
2211 fl6.flowi6_oif = oif;
2212 fl6.flowi6_mark = mark;
2213 fl6.daddr = iph->daddr;
2214 fl6.saddr = iph->saddr;
2215 fl6.flowlabel = ip6_flowinfo(iph);
2216 fl6.flowi6_uid = uid;
2218 dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
2219 rt6_do_redirect(dst, NULL, skb);
2222 EXPORT_SYMBOL_GPL(ip6_redirect);
/* Redirect variant for messages without an embedded offending header: the
 * destination comes from the rd_msg, the "source" from the outer header's
 * destination (i.e. us), and the router from the outer source address.
 */
2224 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2227 const struct ipv6hdr *iph = ipv6_hdr(skb);
2228 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2229 struct dst_entry *dst;
2232 memset(&fl6, 0, sizeof(fl6));
2233 fl6.flowi6_iif = LOOPBACK_IFINDEX;
2234 fl6.flowi6_oif = oif;
2235 fl6.flowi6_mark = mark;
2236 fl6.daddr = msg->dest;
2237 fl6.saddr = iph->daddr;
2238 fl6.flowi6_uid = sock_net_uid(net, NULL);
2240 dst = ip6_route_redirect(net, &fl6, &iph->saddr);
2241 rt6_do_redirect(dst, NULL, skb);
/* Socket convenience wrapper: redirect using the socket's netns, bound
 * device, and mark (remaining argument elided in this listing).
 */
2245 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2247 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2250 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
/* dst_ops->default_advmss hook: advertised MSS = path MTU minus IPv6+TCP
 * headers, floored at the ip6_rt_min_advmss sysctl and capped per the
 * jumbogram comment below.
 */
2252 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2254 struct net_device *dev = dst->dev;
2255 unsigned int mtu = dst_mtu(dst);
2256 struct net *net = dev_net(dev);
2258 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2260 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2261 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2264 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2265 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2266 * IPV6_MAXPLEN is also valid and means: "any MSS,
2267 * rely only on pmtu discovery"
2269 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
/* dst_ops->mtu hook: prefers the learned PMTU, then the raw MTU metric,
 * then the interface's mtu6; caps at IP6_MAX_MTU and subtracts any
 * lightweight-tunnel encapsulation headroom.
 */
2274 static unsigned int ip6_mtu(const struct dst_entry *dst)
2276 const struct rt6_info *rt = (const struct rt6_info *)dst;
2277 unsigned int mtu = rt->rt6i_pmtu;
2278 struct inet6_dev *idev;
2283 mtu = dst_metric_raw(dst, RTAX_MTU);
2290 idev = __in6_dev_get(dst->dev);
2292 mtu = idev->cnf.mtu6;
2296 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2298 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
/* Builds a standalone host route for sending an ICMPv6/ND message on
 * 'dev': not inserted into the fib6 tree, tracked on the uncached list so
 * device teardown can release it, then run through xfrm_lookup().  Returns
 * the dst or an ERR_PTR.
 */
2301 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2304 struct dst_entry *dst;
2305 struct rt6_info *rt;
2306 struct inet6_dev *idev = in6_dev_get(dev);
2307 struct net *net = dev_net(dev);
2309 if (unlikely(!idev))
2310 return ERR_PTR(-ENODEV);
2312 rt = ip6_dst_alloc(net, dev, 0);
2313 if (unlikely(!rt)) {
2315 dst = ERR_PTR(-ENOMEM);
2319 rt->dst.flags |= DST_HOST;
2320 rt->dst.output = ip6_output;
2321 rt->rt6i_gateway = fl6->daddr;
2322 rt->rt6i_dst.addr = fl6->daddr;
2323 rt->rt6i_dst.plen = 128;
2324 rt->rt6i_idev = idev;
2325 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2327 /* Add this dst into uncached_list so that rt6_ifdown() can
2328 * do proper release of the net_device
2330 rt6_uncached_list_add(rt);
2331 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2333 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
/* dst_ops->gc hook: skips GC while under the entry limit and within the
 * minimum interval; otherwise runs fib6 GC with an adaptive expiry that
 * grows each pass and decays by the elasticity factor.  Returns non-zero
 * when still over rt_max_size (allocation should fail).
 */
2339 static int ip6_dst_gc(struct dst_ops *ops)
2341 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2342 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2343 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2344 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2345 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2346 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2349 entries = dst_entries_get_fast(ops);
2350 if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2351 entries <= rt_max_size)
2354 net->ipv6.ip6_rt_gc_expire++;
2355 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2356 entries = dst_entries_get_slow(ops);
2357 if (entries < ops->gc_thresh)
2358 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2360 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2361 return entries > rt_max_size;
/* Converts the netlink RTAX_* metric attributes in cfg into the mx6_config
 * array 'mxc', validating each type and value (CC algorithm names are
 * resolved to keys; HOPLIMIT and FEATURES are range-checked).  Error paths
 * are elided in this listing.
 */
2364 static int ip6_convert_metrics(struct mx6_config *mxc,
2365 const struct fib6_config *cfg)
2367 bool ecn_ca = false;
2375 mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
2379 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
2380 int type = nla_type(nla);
2385 if (unlikely(type > RTAX_MAX))
2388 if (type == RTAX_CC_ALGO) {
2389 char tmp[TCP_CA_NAME_MAX];
2391 nla_strlcpy(tmp, nla, sizeof(tmp));
2392 val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
2393 if (val == TCP_CA_UNSPEC)
2396 val = nla_get_u32(nla);
2398 if (type == RTAX_HOPLIMIT && val > 255)
2400 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
2404 __set_bit(type - 1, mxc->mx_valid);
/* A CC algorithm that implies ECN also sets the ECN_CA feature bit. */
2408 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
2409 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
/* Resolves a configured gateway address within the route's own table
 * (used to validate the nexthop when adding a route); falls back to a
 * full lookup when the table-scoped one yields the null entry.
 */
2419 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2420 struct fib6_config *cfg,
2421 const struct in6_addr *gw_addr)
2423 struct flowi6 fl6 = {
2424 .flowi6_oif = cfg->fc_ifindex,
2426 .saddr = cfg->fc_prefsrc,
2428 struct fib6_table *table;
2429 struct rt6_info *rt;
2430 int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE;
2432 table = fib6_get_table(net, cfg->fc_table);
2436 if (!ipv6_addr_any(&cfg->fc_prefsrc))
2437 flags |= RT6_LOOKUP_F_HAS_SADDR;
2439 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
2441 /* if table lookup failed, fall back to full lookup */
2442 if (rt == net->ipv6.ip6_null_entry) {
/* Builds (but does not insert) a new rt6_info from a netlink/ioctl
 * fib6_config: validates prefix/source lengths and flags, resolves the
 * table and device, configures input/output handlers (multicast, local,
 * forward, lwtunnel, reject types), validates any gateway (including a
 * nexthop lookup for non-link-local gateways), and sets prefsrc.  Returns
 * the new route or ERR_PTR(err).  Many lines are elided in this listing.
 */
2450 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
2451 struct netlink_ext_ack *extack)
2453 struct net *net = cfg->fc_nlinfo.nl_net;
2454 struct rt6_info *rt = NULL;
2455 struct net_device *dev = NULL;
2456 struct inet6_dev *idev = NULL;
2457 struct fib6_table *table;
2461 /* RTF_PCPU is an internal flag; can not be set by userspace */
2462 if (cfg->fc_flags & RTF_PCPU) {
2463 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2467 if (cfg->fc_dst_len > 128) {
2468 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2471 if (cfg->fc_src_len > 128) {
2472 NL_SET_ERR_MSG(extack, "Invalid source address length");
2475 #ifndef CONFIG_IPV6_SUBTREES
2476 if (cfg->fc_src_len) {
2477 NL_SET_ERR_MSG(extack,
2478 "Specifying source address requires IPV6_SUBTREES to be enabled");
2482 if (cfg->fc_ifindex) {
2484 dev = dev_get_by_index(net, cfg->fc_ifindex);
2487 idev = in6_dev_get(dev);
2492 if (cfg->fc_metric == 0)
2493 cfg->fc_metric = IP6_RT_PRIO_USER;
2496 if (cfg->fc_nlinfo.nlh &&
2497 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2498 table = fib6_get_table(net, cfg->fc_table)
2500 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2501 table = fib6_new_table(net, cfg->fc_table);
2504 table = fib6_new_table(net, cfg->fc_table);
2510 rt = ip6_dst_alloc(net, NULL,
2511 (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
2518 if (cfg->fc_flags & RTF_EXPIRES)
2519 rt6_set_expires(rt, jiffies +
2520 clock_t_to_jiffies(cfg->fc_expires));
2522 rt6_clean_expires(rt);
2524 if (cfg->fc_protocol == RTPROT_UNSPEC)
2525 cfg->fc_protocol = RTPROT_BOOT;
2526 rt->rt6i_protocol = cfg->fc_protocol;
2528 addr_type = ipv6_addr_type(&cfg->fc_dst);
2530 if (addr_type & IPV6_ADDR_MULTICAST)
2531 rt->dst.input = ip6_mc_input;
2532 else if (cfg->fc_flags & RTF_LOCAL)
2533 rt->dst.input = ip6_input;
2535 rt->dst.input = ip6_forward;
2537 rt->dst.output = ip6_output;
2539 if (cfg->fc_encap) {
2540 struct lwtunnel_state *lwtstate;
2542 err = lwtunnel_build_state(cfg->fc_encap_type,
2543 cfg->fc_encap, AF_INET6, cfg,
2547 rt->dst.lwtstate = lwtstate_get(lwtstate);
2548 if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
2549 rt->dst.lwtstate->orig_output = rt->dst.output;
2550 rt->dst.output = lwtunnel_output;
2552 if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
2553 rt->dst.lwtstate->orig_input = rt->dst.input;
2554 rt->dst.input = lwtunnel_input;
2558 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
2559 rt->rt6i_dst.plen = cfg->fc_dst_len;
2560 if (rt->rt6i_dst.plen == 128)
2561 rt->dst.flags |= DST_HOST;
2563 #ifdef CONFIG_IPV6_SUBTREES
2564 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
2565 rt->rt6i_src.plen = cfg->fc_src_len;
2568 rt->rt6i_metric = cfg->fc_metric;
2570 /* We cannot add true routes via loopback here,
2571 they would result in kernel looping; promote them to reject routes
2573 if ((cfg->fc_flags & RTF_REJECT) ||
2574 (dev && (dev->flags & IFF_LOOPBACK) &&
2575 !(addr_type & IPV6_ADDR_LOOPBACK) &&
2576 !(cfg->fc_flags & RTF_LOCAL))) {
2577 /* hold loopback dev/idev if we haven't done so. */
2578 if (dev != net->loopback_dev) {
2583 dev = net->loopback_dev;
2585 idev = in6_dev_get(dev);
2591 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
2592 switch (cfg->fc_type) {
2594 rt->dst.error = -EINVAL;
2595 rt->dst.output = dst_discard_out;
2596 rt->dst.input = dst_discard;
2599 rt->dst.error = -EACCES;
2600 rt->dst.output = ip6_pkt_prohibit_out;
2601 rt->dst.input = ip6_pkt_prohibit;
2604 case RTN_UNREACHABLE:
2606 rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
2607 : (cfg->fc_type == RTN_UNREACHABLE)
2608 ? -EHOSTUNREACH : -ENETUNREACH;
2609 rt->dst.output = ip6_pkt_discard_out;
2610 rt->dst.input = ip6_pkt_discard;
2616 if (cfg->fc_flags & RTF_GATEWAY) {
2617 const struct in6_addr *gw_addr;
2620 gw_addr = &cfg->fc_gateway;
2621 gwa_type = ipv6_addr_type(gw_addr);
2623 /* if gw_addr is local we will fail to detect this in case
2624 * address is still TENTATIVE (DAD in progress). rt6_lookup()
2625 * will return already-added prefix route via interface that
2626 * prefix route was assigned to, which might be non-loopback.
2629 if (ipv6_chk_addr_and_flags(net, gw_addr,
2630 gwa_type & IPV6_ADDR_LINKLOCAL ?
2631 dev : NULL, 0, 0)) {
2632 NL_SET_ERR_MSG(extack, "Invalid gateway address");
2635 rt->rt6i_gateway = *gw_addr;
2637 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
2638 struct rt6_info *grt = NULL;
2640 /* IPv6 strictly inhibits using not link-local
2641 addresses as nexthop address.
2642 Otherwise, router will not able to send redirects.
2643 It is very good, but in some (rare!) circumstances
2644 (SIT, PtP, NBMA NOARP links) it is handy to allow
2645 some exceptions. --ANK
2646 We allow IPv4-mapped nexthops to support RFC4798-type
2649 if (!(gwa_type & (IPV6_ADDR_UNICAST |
2650 IPV6_ADDR_MAPPED))) {
2651 NL_SET_ERR_MSG(extack,
2652 "Invalid gateway address");
2656 if (cfg->fc_table) {
2657 grt = ip6_nh_lookup_table(net, cfg, gw_addr);
2660 if (grt->rt6i_flags & RTF_GATEWAY ||
2661 (dev && dev != grt->dst.dev)) {
2669 grt = rt6_lookup(net, gw_addr, NULL,
2670 cfg->fc_ifindex, 1);
2672 err = -EHOSTUNREACH;
2676 if (dev != grt->dst.dev) {
2682 idev = grt->rt6i_idev;
2684 in6_dev_hold(grt->rt6i_idev);
2686 if (!(grt->rt6i_flags & RTF_GATEWAY))
2695 NL_SET_ERR_MSG(extack, "Egress device not specified");
2697 } else if (dev->flags & IFF_LOOPBACK) {
2698 NL_SET_ERR_MSG(extack,
2699 "Egress device can not be loopback device for this route");
2708 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2709 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2710 NL_SET_ERR_MSG(extack, "Invalid source address");
2714 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2715 rt->rt6i_prefsrc.plen = 128;
2717 rt->rt6i_prefsrc.plen = 0;
2719 rt->rt6i_flags = cfg->fc_flags;
2723 rt->rt6i_idev = idev;
2724 rt->rt6i_table = table;
2726 cfg->fc_nlinfo.nl_net = dev_net(dev);
2735 dst_release_immediate(&rt->dst);
2737 return ERR_PTR(err);
/* Creates a route from cfg, converts its metrics, and inserts it into the
 * fib6 tree; on failure the freshly created route is released.
 */
2740 int ip6_route_add(struct fib6_config *cfg,
2741 struct netlink_ext_ack *extack)
2743 struct mx6_config mxc = { .mx = NULL, };
2744 struct rt6_info *rt;
2747 rt = ip6_route_info_create(cfg, extack);
2754 err = ip6_convert_metrics(&mxc, cfg);
2758 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
2765 dst_release_immediate(&rt->dst);
/* Deletes one route from its table under tb6_lock; refuses to delete the
 * null entry.  Error paths are elided in this listing.
 */
2770 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2773 struct fib6_table *table;
2774 struct net *net = dev_net(rt->dst.dev);
2776 if (rt == net->ipv6.ip6_null_entry) {
2781 table = rt->rt6i_table;
2782 spin_lock_bh(&table->tb6_lock);
2783 err = fib6_del(rt, info);
2784 spin_unlock_bh(&table->tb6_lock);
/* Public wrapper: deletes rt with a default nl_info built from its
 * device's network namespace.
 */
2791 int ip6_del_rt(struct rt6_info *rt)
2793 struct nl_info info = {
2794 .nl_net = dev_net(rt->dst.dev),
2796 return __ip6_del_rt(rt, &info);
/* Deletes a multipath route: when fc_delete_all_nh is set, removes every
 * sibling nexthop under one tb6_lock hold and emits a single RTM_DELROUTE
 * notification covering all hops (per-hop notifications suppressed via
 * skip_notify).
 */
2799 static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
2801 struct nl_info *info = &cfg->fc_nlinfo;
2802 struct net *net = info->nl_net;
2803 struct sk_buff *skb = NULL;
2804 struct fib6_table *table;
2807 if (rt == net->ipv6.ip6_null_entry)
2809 table = rt->rt6i_table;
2810 spin_lock_bh(&table->tb6_lock);
2812 if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
2813 struct rt6_info *sibling, *next_sibling;
2815 /* prefer to send a single notification with all hops */
2816 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
2818 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2820 if (rt6_fill_node(net, skb, rt,
2821 NULL, NULL, 0, RTM_DELROUTE,
2822 info->portid, seq, 0) < 0) {
2826 info->skip_notify = 1;
2829 list_for_each_entry_safe(sibling, next_sibling,
2832 err = fib6_del(sibling, info);
2838 err = fib6_del(rt, info);
2840 spin_unlock_bh(&table->tb6_lock);
2845 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
2846 info->nlh, gfp_any());
/* Deletes the route(s) described by cfg: locates the matching fib6 node,
 * then filters candidates by exception-cache membership, interface,
 * gateway, metric, and protocol; a gateway-specific request removes only
 * that one hop, otherwise siblings are handled too.  Several lines are
 * elided in this listing.
 */
2851 static int ip6_route_del(struct fib6_config *cfg,
2852 struct netlink_ext_ack *extack)
2854 struct rt6_info *rt, *rt_cache;
2855 struct fib6_table *table;
2856 struct fib6_node *fn;
2859 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
2861 NL_SET_ERR_MSG(extack, "FIB table does not exist");
2867 fn = fib6_locate(&table->tb6_root,
2868 &cfg->fc_dst, cfg->fc_dst_len,
2869 &cfg->fc_src, cfg->fc_src_len,
2870 !(cfg->fc_flags & RTF_CACHE));
2873 for_each_fib6_node_rt_rcu(fn) {
2874 if (cfg->fc_flags & RTF_CACHE) {
2875 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
2881 if (cfg->fc_ifindex &&
2883 rt->dst.dev->ifindex != cfg->fc_ifindex))
2885 if (cfg->fc_flags & RTF_GATEWAY &&
2886 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
2888 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
2890 if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
2892 if (!dst_hold_safe(&rt->dst))
2896 /* if gateway was specified only delete the one hop */
2897 if (cfg->fc_flags & RTF_GATEWAY)
2898 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
2900 return __ip6_del_rt_siblings(rt, cfg);
/* Handle an ICMPv6 Redirect for @dst: validate the message per the
 * Neighbor Discovery rules, update the neighbour cache, and install a
 * cached (exception) route pointing at the new first hop.
 * NOTE(review): several validation branches and goto targets are elided
 * in this extraction; the visible checks are the message-format ones.
 */
2908 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
2910 struct netevent_redirect netevent;
2911 struct rt6_info *rt, *nrt = NULL;
2912 struct ndisc_options ndopts;
2913 struct inet6_dev *in6_dev;
2914 struct neighbour *neigh;
2916 int optlen, on_link;
/* Option length = transport payload minus the fixed rd_msg header. */
2919 optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
2920 optlen -= sizeof(*msg);
2923 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
2927 msg = (struct rd_msg *)icmp6_hdr(skb);
2929 if (ipv6_addr_is_multicast(&msg->dest)) {
2930 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
/* dest == target means the destination is on-link; otherwise the
 * target must be a link-local unicast router address. */
2935 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
2937 } else if (ipv6_addr_type(&msg->target) !=
2938 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
2939 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
2943 in6_dev = __in6_dev_get(skb->dev);
/* Routers and interfaces configured to ignore redirects drop it. */
2946 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
2950 * The IP source address of the Redirect MUST be the same as the current
2951 * first-hop router for the specified ICMP Destination Address.
2954 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
2955 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
2960 if (ndopts.nd_opts_tgt_lladdr) {
2961 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
2964 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
2969 rt = (struct rt6_info *) dst;
2970 if (rt->rt6i_flags & RTF_REJECT) {
2971 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
2975 /* Redirect received -> path was valid.
2976 * Look, redirects are sent only in response to data packets,
2977 * so that this nexthop apparently is reachable. --ANK
2979 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
/* Create-or-find the neighbour entry for the redirect target. */
2981 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
2986 * We have finally decided to accept it.
2989 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
2990 NEIGH_UPDATE_F_WEAK_OVERRIDE|
2991 NEIGH_UPDATE_F_OVERRIDE|
2992 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
2993 NEIGH_UPDATE_F_ISROUTER)),
2994 NDISC_REDIRECT, &ndopts);
/* Clone a host (cached) route for the redirected destination. */
2996 nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
3000 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3002 nrt->rt6i_flags &= ~RTF_GATEWAY;
3004 nrt->rt6i_protocol = RTPROT_REDIRECT;
3005 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3007 /* No need to remove rt from the exception table if rt is
3008 * a cached route because rt6_insert_exception() will
3011 if (rt6_insert_exception(nrt, rt)) {
3012 dst_release_immediate(&nrt->dst);
/* Notify interested parties (e.g. XFRM) about the path change. */
3016 netevent.old = &rt->dst;
3017 netevent.new = &nrt->dst;
3018 netevent.daddr = &msg->dest;
3019 netevent.neigh = neigh;
3020 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3023 neigh_release(neigh);
3027 * Misc support functions
/* Link @rt to its parent @from: take a reference on from's dst, record
 * it in rt->dst.from and share (read-only) from's metrics.  @from must
 * not itself have a "from" parent (BUG otherwise).
 */
3030 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
3032 BUG_ON(from->dst.from);
3034 rt->rt6i_flags &= ~RTF_EXPIRES;
3035 dst_hold(&from->dst);
3036 rt->dst.from = &from->dst;
/* true => metrics are shared read-only with the parent. */
3037 dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
3040 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
3042 rt->dst.input = ort->dst.input;
3043 rt->dst.output = ort->dst.output;
3044 rt->rt6i_dst = ort->rt6i_dst;
3045 rt->dst.error = ort->dst.error;
3046 rt->rt6i_idev = ort->rt6i_idev;
3048 in6_dev_hold(rt->rt6i_idev);
3049 rt->dst.lastuse = jiffies;
3050 rt->rt6i_gateway = ort->rt6i_gateway;
3051 rt->rt6i_flags = ort->rt6i_flags;
3052 rt6_set_from(rt, ort);
3053 rt->rt6i_metric = ort->rt6i_metric;
3054 #ifdef CONFIG_IPV6_SUBTREES
3055 rt->rt6i_src = ort->rt6i_src;
3057 rt->rt6i_prefsrc = ort->rt6i_prefsrc;
3058 rt->rt6i_table = ort->rt6i_table;
3059 rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
3062 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Look up an RA-learned Route Information route for @prefix/@prefixlen
 * via @gwaddr on @dev.  Returns the route with a reference held, or
 * NULL when not found (elided paths).  Table defaults to RT6_TABLE_INFO
 * unless the device is enslaved to an l3mdev (VRF).
 */
3063 static struct rt6_info *rt6_get_route_info(struct net *net,
3064 const struct in6_addr *prefix, int prefixlen,
3065 const struct in6_addr *gwaddr,
3066 struct net_device *dev)
3068 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3069 int ifindex = dev->ifindex;
3070 struct fib6_node *fn;
3071 struct rt6_info *rt = NULL;
3072 struct fib6_table *table;
3074 table = fib6_get_table(net, tb_id);
3079 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3083 for_each_fib6_node_rt_rcu(fn) {
/* Match device, the RTF_ROUTEINFO|RTF_GATEWAY flag pair, and gateway. */
3084 if (rt->dst.dev->ifindex != ifindex)
3086 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3088 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
3090 ip6_hold_safe(NULL, &rt, false);
/* Install a route learned from an RA Route Information option and
 * return it (looked up again via rt6_get_route_info()).  A zero
 * prefix length is treated as a default route.
 */
3098 static struct rt6_info *rt6_add_route_info(struct net *net,
3099 const struct in6_addr *prefix, int prefixlen,
3100 const struct in6_addr *gwaddr,
3101 struct net_device *dev,
3104 struct fib6_config cfg = {
3105 .fc_metric = IP6_RT_PRIO_USER,
3106 .fc_ifindex = dev->ifindex,
3107 .fc_dst_len = prefixlen,
3108 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3109 RTF_UP | RTF_PREF(pref),
3110 .fc_protocol = RTPROT_RA,
3111 .fc_nlinfo.portid = 0,
3112 .fc_nlinfo.nlh = NULL,
3113 .fc_nlinfo.nl_net = net,
/* VRF-aware table selection, falling back to RT6_TABLE_INFO. */
3116 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3117 cfg.fc_dst = *prefix;
3118 cfg.fc_gateway = *gwaddr;
3120 /* We should treat it as a default route if prefix length is 0. */
3122 cfg.fc_flags |= RTF_DEFAULT;
/* Add result intentionally ignored; the lookup below reports state. */
3124 ip6_route_add(&cfg, NULL);
3126 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
/* Find the RA-learned default router entry for gateway @addr on @dev
 * in the (possibly l3mdev-specific) default table; returns the route
 * with a reference held, or NULL on the elided not-found paths.
 */
3130 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
3132 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3133 struct rt6_info *rt;
3134 struct fib6_table *table;
3136 table = fib6_get_table(dev_net(dev), tb_id);
3141 for_each_fib6_node_rt_rcu(&table->tb6_root) {
/* Must be an addrconf default route on this device with this gateway. */
3142 if (dev == rt->dst.dev &&
3143 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3144 ipv6_addr_equal(&rt->rt6i_gateway, addr))
3148 ip6_hold_safe(NULL, &rt, false);
/* Install a default router learned via Router Advertisement and mark
 * the owning table as holding a default route, then return the newly
 * inserted entry via rt6_get_dflt_router().
 */
3153 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
3154 struct net_device *dev,
3157 struct fib6_config cfg = {
3158 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3159 .fc_metric = IP6_RT_PRIO_USER,
3160 .fc_ifindex = dev->ifindex,
3161 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3162 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3163 .fc_protocol = RTPROT_RA,
3164 .fc_nlinfo.portid = 0,
3165 .fc_nlinfo.nlh = NULL,
3166 .fc_nlinfo.nl_net = dev_net(dev),
3169 cfg.fc_gateway = *gwaddr;
3171 if (!ip6_route_add(&cfg, NULL)) {
3172 struct fib6_table *table;
3174 table = fib6_get_table(dev_net(dev), cfg.fc_table);
/* Flag lets rt6_purge_dflt_routers() skip tables with no RA routers. */
3176 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3179 return rt6_get_dflt_router(gwaddr, dev);
/* Remove all RA-learned default/addrconf routes from @table, except on
 * interfaces configured with accept_ra == 2 (accept RA even when
 * forwarding).  Clears the table's HAS_DFLT_ROUTER flag when done.
 * NOTE(review): the deletion call and loop-restart logic are elided here.
 */
3182 static void __rt6_purge_dflt_routers(struct fib6_table *table)
3184 struct rt6_info *rt;
3188 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3189 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3190 (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
3191 if (dst_hold_safe(&rt->dst)) {
3202 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
/* Purge RA default routers from every FIB table in @net that is
 * flagged as containing one (see __rt6_purge_dflt_routers()).
 */
3205 void rt6_purge_dflt_routers(struct net *net)
3207 struct fib6_table *table;
3208 struct hlist_head *head;
/* Walk every hash bucket of the per-netns fib table hash under RCU. */
3213 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3214 head = &net->ipv6.fib_table_hash[h];
3215 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3216 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3217 __rt6_purge_dflt_routers(table);
/* Translate a legacy ioctl struct in6_rtmsg into a fib6_config so the
 * SIOCADDRT/SIOCDELRT paths can reuse the netlink add/del code.
 */
3224 static void rtmsg_to_fib6_config(struct net *net,
3225 struct in6_rtmsg *rtmsg,
3226 struct fib6_config *cfg)
3228 memset(cfg, 0, sizeof(*cfg));
/* Pick the l3mdev (VRF) table for the interface when there is one. */
3230 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3232 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3233 cfg->fc_metric = rtmsg->rtmsg_metric;
3234 cfg->fc_expires = rtmsg->rtmsg_info;
3235 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3236 cfg->fc_src_len = rtmsg->rtmsg_src_len;
3237 cfg->fc_flags = rtmsg->rtmsg_flags;
3239 cfg->fc_nlinfo.nl_net = net;
3241 cfg->fc_dst = rtmsg->rtmsg_dst;
3242 cfg->fc_src = rtmsg->rtmsg_src;
3243 cfg->fc_gateway = rtmsg->rtmsg_gateway;
/* Legacy route ioctl entry point: SIOCADDRT / SIOCDELRT.
 * Requires CAP_NET_ADMIN; copies the userspace in6_rtmsg, converts it
 * to a fib6_config and dispatches to ip6_route_add()/ip6_route_del().
 * NOTE(review): the switch on cmd, rtnl locking and default return are
 * partially elided in this extraction.
 */
3246 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3248 struct fib6_config cfg;
3249 struct in6_rtmsg rtmsg;
3253 case SIOCADDRT: /* Add a route */
3254 case SIOCDELRT: /* Delete a route */
3255 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3257 err = copy_from_user(&rtmsg, arg,
3258 sizeof(struct in6_rtmsg));
3262 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3267 err = ip6_route_add(&cfg, NULL);
3270 err = ip6_route_del(&cfg, NULL);
3284 * Drop the packet on the floor
/* Drop @skb, bump the appropriate IPSTATS MIB counter and send an
 * ICMPv6 Destination Unreachable with the given @code.  Unspecified
 * destinations on input count as IN address errors instead.
 */
3287 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3290 struct dst_entry *dst = skb_dst(skb);
3291 switch (ipstats_mib_noroutes) {
3292 case IPSTATS_MIB_INNOROUTES:
3293 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3294 if (type == IPV6_ADDR_ANY) {
3295 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3296 IPSTATS_MIB_INADDRERRORS);
3300 case IPSTATS_MIB_OUTNOROUTES:
3301 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3302 ipstats_mib_noroutes);
/* Tell the sender why the packet was dropped. */
3305 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
/* dst input handler for black-hole routes: drop with "no route". */
3310 static int ip6_pkt_discard(struct sk_buff *skb)
3312 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
/* dst output handler for black-hole routes: drop with "no route". */
3315 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
/* icmpv6_send() reports via skb->dev, so point it at the dst device. */
3317 skb->dev = skb_dst(skb)->dev;
3318 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
/* dst input handler for prohibit routes: administratively prohibited. */
3321 static int ip6_pkt_prohibit(struct sk_buff *skb)
3323 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
/* dst output handler for prohibit routes: administratively prohibited. */
3326 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
/* icmpv6_send() reports via skb->dev, so point it at the dst device. */
3328 skb->dev = skb_dst(skb)->dev;
3329 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3333 * Allocate a dst for local (unicast / anycast) address.
/* Allocate a host route (/128) for a local unicast or anycast address
 * on @idev.  The route is DST_NOCOUNT (not charged against the route
 * cache), delivers locally via ip6_input and lives in the local table
 * (or the device's l3mdev table when enslaved).
 */
3336 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
3337 const struct in6_addr *addr,
3341 struct net *net = dev_net(idev->dev);
3342 struct net_device *dev = idev->dev;
3343 struct rt6_info *rt;
3345 rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
3347 return ERR_PTR(-ENOMEM);
3351 rt->dst.flags |= DST_HOST;
3352 rt->dst.input = ip6_input;
3353 rt->dst.output = ip6_output;
3354 rt->rt6i_idev = idev;
3356 rt->rt6i_protocol = RTPROT_KERNEL;
3357 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
/* Elided condition selects anycast vs. local flavour of the route. */
3359 rt->rt6i_flags |= RTF_ANYCAST;
3361 rt->rt6i_flags |= RTF_LOCAL;
3363 rt->rt6i_gateway = *addr;
3364 rt->rt6i_dst.addr = *addr;
3365 rt->rt6i_dst.plen = 128;
3366 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3367 rt->rt6i_table = fib6_get_table(net, tb_id);
3372 /* remove deleted ip from prefsrc entries */
/* Argument bundle for fib6_remove_prefsrc(): the device whose address
 * went away (NULL = any device) and the removed address itself. */
3373 struct arg_dev_net_ip {
3374 struct net_device *dev;
3376 struct in6_addr *addr;
/* fib6_clean_all() callback: clear the preferred-source address of any
 * route that references the deleted address, and scrub the route's
 * exception (cache) entries to match.  Returns 0 on the elided paths
 * so tree walking continues.
 */
3379 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
3381 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3382 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3383 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3385 if (((void *)rt->dst.dev == dev || !dev) &&
3386 rt != net->ipv6.ip6_null_entry &&
3387 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
3388 spin_lock_bh(&rt6_exception_lock);
3389 /* remove prefsrc entry */
3390 rt->rt6i_prefsrc.plen = 0;
3391 /* need to update cache as well */
3392 rt6_exceptions_remove_prefsrc(rt);
3393 spin_unlock_bh(&rt6_exception_lock);
/* Drop @ifp's address as preferred source from all routes in its netns
 * by walking every table with the fib6_remove_prefsrc() callback.
 */
3398 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3400 struct net *net = dev_net(ifp->idev->dev);
3401 struct arg_dev_net_ip adni = {
3402 .dev = ifp->idev->dev,
3406 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3409 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3411 /* Remove routers and update dst entries when gateway turn into host. */
/* fib6_clean_all() callback: when a node stops being a router, remove
 * RA router routes (RTF_ADDRCONF|RTF_DEFAULT|RTF_GATEWAY) that point at
 * @gateway, and purge matching cached/exception routes.
 */
3412 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
3414 struct in6_addr *gateway = (struct in6_addr *)arg;
3416 if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3417 ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
3421 /* Further clean up cached routes in exception table.
3422 * This is needed because cached route may have a different
3423 * gateway than its 'parent' in the case of an ip redirect.
3425 rt6_exceptions_clean_tohost(rt, gateway);
/* Walk all FIB tables in @net removing router routes via @gateway
 * (see fib6_clean_tohost()).
 */
3430 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3432 fib6_clean_all(net, fib6_clean_tohost, gateway);
/* Argument bundle for fib6_ifdown(): the device going down (NULL means
 * "all devices") plus its netns (field elided in this extraction). */
3435 struct arg_dev_net {
3436 struct net_device *dev;
3440 /* called with write lock held for table with rt */
/* fib6_clean_all() callback: select routes to delete when @dev goes
 * down.  Multipath siblings survive an administrative link-down when
 * ignore_routes_with_linkdown is set, but not device unregistration.
 */
3441 static int fib6_ifdown(struct rt6_info *rt, void *arg)
3443 const struct arg_dev_net *adn = arg;
3444 const struct net_device *dev = adn->dev;
3446 if ((rt->dst.dev == dev || !dev) &&
3447 rt != adn->net->ipv6.ip6_null_entry &&
3448 (rt->rt6i_nsiblings == 0 ||
3449 (dev && netdev_unregistering(dev)) ||
3450 !rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
3456 void rt6_ifdown(struct net *net, struct net_device *dev)
3458 struct arg_dev_net adn = {
3463 fib6_clean_all(net, fib6_ifdown, &adn);
3465 rt6_uncached_list_flush_dev(net, dev);
/* Argument bundle for rt6_mtu_change_route(): device whose MTU changed
 * and the new MTU value (field elided in this extraction). */
3468 struct rt6_mtu_change_arg {
3469 struct net_device *dev;
/* fib6_clean_all() callback: propagate a device MTU change into route
 * metrics and cached exception routes (see RFC 1981 discussion below).
 */
3473 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
3475 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
3476 struct inet6_dev *idev;
3478 /* In IPv6 pmtu discovery is not optional,
3479 so that RTAX_MTU lock cannot disable it.
3480 We still use this lock to block changes
3481 caused by addrconf/ndisc.
3484 idev = __in6_dev_get(arg->dev);
3488 /* For administrative MTU increase, there is no way to discover
3489 IPv6 PMTU increase, so PMTU increase should be updated here.
3490 Since RFC 1981 doesn't include administrative MTU increase
3491 update PMTU increase is a MUST. (i.e. jumbo frame)
3494 If new MTU is less than route PMTU, this new MTU will be the
3495 lowest MTU in the path, update the route PMTU to reflect PMTU
3496 decreases; if new MTU is greater than route PMTU, and the
3497 old MTU is the lowest MTU in the path, update the route PMTU
3498 to reflect the increase. In this case if the other nodes' MTU
3499 also have the lowest MTU, TOO BIG MESSAGE will be lead to
3502 if (rt->dst.dev == arg->dev &&
3503 dst_metric_raw(&rt->dst, RTAX_MTU) &&
3504 !dst_metric_locked(&rt->dst, RTAX_MTU)) {
3505 spin_lock_bh(&rt6_exception_lock);
3506 if (dst_mtu(&rt->dst) >= arg->mtu ||
3507 (dst_mtu(&rt->dst) < arg->mtu &&
3508 dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
3509 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
/* Update cached exception routes under the same lock. */
3511 rt6_exceptions_update_pmtu(rt, arg->mtu);
3512 spin_unlock_bh(&rt6_exception_lock);
/* Apply a new device MTU to all routes in @dev's netns by walking the
 * tables with rt6_mtu_change_route().
 */
3517 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
3519 struct rt6_mtu_change_arg arg = {
3524 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
/* Netlink attribute validation policy for RTM_*ROUTE requests:
 * type/length constraints checked by nlmsg_parse(). */
3527 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
3528 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
3529 [RTA_OIF] = { .type = NLA_U32 },
3530 [RTA_IIF] = { .type = NLA_U32 },
3531 [RTA_PRIORITY] = { .type = NLA_U32 },
3532 [RTA_METRICS] = { .type = NLA_NESTED },
3533 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
3534 [RTA_PREF] = { .type = NLA_U8 },
3535 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
3536 [RTA_ENCAP] = { .type = NLA_NESTED },
3537 [RTA_EXPIRES] = { .type = NLA_U32 },
3538 [RTA_UID] = { .type = NLA_U32 },
3539 [RTA_MARK] = { .type = NLA_U32 },
/* Parse an RTM_NEWROUTE/RTM_DELROUTE netlink message into @cfg.
 * Validates attributes against rtm_ipv6_policy, maps rtm_type to RTF_*
 * flags, and copies addresses, metrics, multipath and encap data.
 * NOTE(review): several goto-errout branches and closing braces are
 * elided in this extraction.
 */
3542 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
3543 struct fib6_config *cfg,
3544 struct netlink_ext_ack *extack)
3547 struct nlattr *tb[RTA_MAX+1];
3551 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
3557 rtm = nlmsg_data(nlh);
3558 memset(cfg, 0, sizeof(*cfg));
3560 cfg->fc_table = rtm->rtm_table;
3561 cfg->fc_dst_len = rtm->rtm_dst_len;
3562 cfg->fc_src_len = rtm->rtm_src_len;
3563 cfg->fc_flags = RTF_UP;
3564 cfg->fc_protocol = rtm->rtm_protocol;
3565 cfg->fc_type = rtm->rtm_type;
/* Reject-style route types all map onto RTF_REJECT; the specific
 * variant is preserved in fc_type. */
3567 if (rtm->rtm_type == RTN_UNREACHABLE ||
3568 rtm->rtm_type == RTN_BLACKHOLE ||
3569 rtm->rtm_type == RTN_PROHIBIT ||
3570 rtm->rtm_type == RTN_THROW)
3571 cfg->fc_flags |= RTF_REJECT;
3573 if (rtm->rtm_type == RTN_LOCAL)
3574 cfg->fc_flags |= RTF_LOCAL;
3576 if (rtm->rtm_flags & RTM_F_CLONED)
3577 cfg->fc_flags |= RTF_CACHE;
3579 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
3580 cfg->fc_nlinfo.nlh = nlh;
3581 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
3583 if (tb[RTA_GATEWAY]) {
3584 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
3585 cfg->fc_flags |= RTF_GATEWAY;
/* Destination prefix: only the prefix-length's worth of bytes need
 * to be present in the attribute. */
3589 int plen = (rtm->rtm_dst_len + 7) >> 3;
3591 if (nla_len(tb[RTA_DST]) < plen)
3594 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
3598 int plen = (rtm->rtm_src_len + 7) >> 3;
3600 if (nla_len(tb[RTA_SRC]) < plen)
3603 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
3606 if (tb[RTA_PREFSRC])
3607 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
3610 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
3612 if (tb[RTA_PRIORITY])
3613 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
3615 if (tb[RTA_METRICS]) {
3616 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
3617 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
/* RTA_TABLE overrides the 8-bit rtm_table field when present. */
3621 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
3623 if (tb[RTA_MULTIPATH]) {
3624 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
3625 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
3627 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
3628 cfg->fc_mp_len, extack);
/* Router preference: out-of-range values fall back to MEDIUM. */
3634 pref = nla_get_u8(tb[RTA_PREF]);
3635 if (pref != ICMPV6_ROUTER_PREF_LOW &&
3636 pref != ICMPV6_ROUTER_PREF_HIGH)
3637 pref = ICMPV6_ROUTER_PREF_MEDIUM;
3638 cfg->fc_flags |= RTF_PREF(pref);
3642 cfg->fc_encap = tb[RTA_ENCAP];
3644 if (tb[RTA_ENCAP_TYPE]) {
3645 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
3647 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
3652 if (tb[RTA_EXPIRES]) {
3653 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
3655 if (addrconf_finite_timeout(timeout)) {
3656 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
3657 cfg->fc_flags |= RTF_EXPIRES;
/* One parsed nexthop queued during multipath route add: the built
 * rt6_info, the per-hop config, its metrics and the list linkage. */
3667 struct rt6_info *rt6_info;
3668 struct fib6_config r_cfg;
3669 struct mx6_config mxc;
3670 struct list_head next;
/* Log every nexthop of a failed multipath replace so the admin can
 * audit what state the table may have been left in.
 */
3673 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
3677 list_for_each_entry(nh, rt6_nh_list, next) {
3678 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
3679 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
3680 nh->r_cfg.fc_ifindex);
/* Append (@rt, @r_cfg) to @rt6_nh_list unless an equivalent nexthop is
 * already queued.  Allocates the rt6_nh node and converts the config's
 * metrics; error paths are elided in this extraction.
 */
3684 static int ip6_route_info_append(struct list_head *rt6_nh_list,
3685 struct rt6_info *rt, struct fib6_config *r_cfg)
3690 list_for_each_entry(nh, rt6_nh_list, next) {
3691 /* check if rt6_info already exists */
3692 if (rt6_duplicate_nexthop(nh->rt6_info, rt))
3696 nh = kzalloc(sizeof(*nh), GFP_KERNEL);
3700 err = ip6_convert_metrics(&nh->mxc, r_cfg);
3705 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
3706 list_add_tail(&nh->next, rt6_nh_list);
/* Send the RTM_NEWROUTE notification for a multipath add/append.
 * For NLM_F_APPEND the notification must start at the first sibling of
 * the route list, not the appended tail — see comment below.
 */
3711 static void ip6_route_mpath_notify(struct rt6_info *rt,
3712 struct rt6_info *rt_last,
3713 struct nl_info *info,
3716 /* if this is an APPEND route, then rt points to the first route
3717 * inserted and rt_last points to last route inserted. Userspace
3718 * wants a consistent dump of the route which starts at the first
3719 * nexthop. Since sibling routes are always added at the end of
3720 * the list, find the first sibling of the last route appended
3722 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
3723 rt = list_first_entry(&rt_last->rt6i_siblings,
3729 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
/* Add (or replace/append) a multipath route: parse each rtnexthop in
 * cfg->fc_mp into an rt6_info, queue them on rt6_nh_list, then insert
 * them one by one with a single combined notification at the end.
 * On insertion failure, already-added nexthops are rolled back via
 * ip6_route_del().  NOTE(review): error-path labels and some branches
 * are elided in this extraction.
 */
3732 static int ip6_route_multipath_add(struct fib6_config *cfg,
3733 struct netlink_ext_ack *extack)
3735 struct rt6_info *rt_notif = NULL, *rt_last = NULL;
3736 struct nl_info *info = &cfg->fc_nlinfo;
3737 struct fib6_config r_cfg;
3738 struct rtnexthop *rtnh;
3739 struct rt6_info *rt;
3740 struct rt6_nh *err_nh;
3741 struct rt6_nh *nh, *nh_safe;
3747 int replace = (cfg->fc_nlinfo.nlh &&
3748 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
3749 LIST_HEAD(rt6_nh_list);
3751 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
3752 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
3753 nlflags |= NLM_F_APPEND;
3755 remaining = cfg->fc_mp_len;
3756 rtnh = (struct rtnexthop *)cfg->fc_mp;
3758 /* Parse a Multipath Entry and build a list (rt6_nh_list) of
3759 * rt6_info structs per nexthop
3761 while (rtnh_ok(rtnh, remaining)) {
/* Each hop starts from the base config, overridden by per-hop attrs. */
3762 memcpy(&r_cfg, cfg, sizeof(*cfg));
3763 if (rtnh->rtnh_ifindex)
3764 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3766 attrlen = rtnh_attrlen(rtnh);
3768 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3770 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3772 r_cfg.fc_gateway = nla_get_in6_addr(nla);
3773 r_cfg.fc_flags |= RTF_GATEWAY;
3775 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
3776 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
3778 r_cfg.fc_encap_type = nla_get_u16(nla);
3781 rt = ip6_route_info_create(&r_cfg, extack);
3788 err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
3790 dst_release_immediate(&rt->dst);
3794 rtnh = rtnh_next(rtnh, &remaining);
3797 /* for add and replace send one notification with all nexthops.
3798 * Skip the notification in fib6_add_rt2node and send one with
3799 * the full route when done
3801 info->skip_notify = 1;
3804 list_for_each_entry(nh, &rt6_nh_list, next) {
3805 rt_last = nh->rt6_info;
3806 err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
3807 /* save reference to first route for notification */
3808 if (!rt_notif && !err)
3809 rt_notif = nh->rt6_info;
3811 /* nh->rt6_info is used or freed at this point, reset to NULL*/
3812 nh->rt6_info = NULL;
3815 ip6_print_replace_route_err(&rt6_nh_list);
3820 /* Because each route is added like a single route we remove
3821 * these flags after the first nexthop: if there is a collision,
3822 * we have already failed to add the first nexthop:
3823 * fib6_add_rt2node() has rejected it; when replacing, old
3824 * nexthops have been replaced by first new, the rest should
3827 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
3832 /* success ... tell user about new route */
3833 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3837 /* send notification for routes that were added so that
3838 * the delete notifications sent by ip6_route_del are
3842 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3844 /* Delete routes that were already added */
3845 list_for_each_entry(nh, &rt6_nh_list, next) {
3848 ip6_route_del(&nh->r_cfg, extack);
/* Free any still-queued (not-inserted) nexthops and their metrics. */
3852 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
3854 dst_release_immediate(&nh->rt6_info->dst);
3856 list_del(&nh->next);
/* Delete a multipath route one nexthop at a time: each rtnexthop in
 * cfg->fc_mp becomes a single-hop delete via ip6_route_del().
 * Per the elided tail, the last failure is what gets reported.
 */
3863 static int ip6_route_multipath_del(struct fib6_config *cfg,
3864 struct netlink_ext_ack *extack)
3866 struct fib6_config r_cfg;
3867 struct rtnexthop *rtnh;
3870 int err = 1, last_err = 0;
3872 remaining = cfg->fc_mp_len;
3873 rtnh = (struct rtnexthop *)cfg->fc_mp;
3875 /* Parse a Multipath Entry */
3876 while (rtnh_ok(rtnh, remaining)) {
3877 memcpy(&r_cfg, cfg, sizeof(*cfg));
3878 if (rtnh->rtnh_ifindex)
3879 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3881 attrlen = rtnh_attrlen(rtnh);
3883 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3885 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3887 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
3888 r_cfg.fc_flags |= RTF_GATEWAY;
3891 err = ip6_route_del(&r_cfg, extack);
3895 rtnh = rtnh_next(rtnh, &remaining);
/* RTM_DELROUTE handler: parse the request, then dispatch to the
 * multipath or single-route delete path.  fc_delete_all_nh makes the
 * single-route path remove all ECMP siblings too.
 */
3901 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3902 struct netlink_ext_ack *extack)
3904 struct fib6_config cfg;
3907 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3912 return ip6_route_multipath_del(&cfg, extack);
3914 cfg.fc_delete_all_nh = 1;
3915 return ip6_route_del(&cfg, extack);
/* RTM_NEWROUTE handler: parse the request, then dispatch to the
 * multipath add path when RTA_MULTIPATH is present, else to
 * ip6_route_add().
 */
3919 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3920 struct netlink_ext_ack *extack)
3922 struct fib6_config cfg;
3925 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3930 return ip6_route_multipath_add(&cfg, extack);
3932 return ip6_route_add(&cfg, extack);
/* Upper-bound the netlink message size needed by rt6_fill_node() for
 * @rt, including one RTA_MULTIPATH nexthop per ECMP sibling.
 */
3935 static size_t rt6_nlmsg_size(struct rt6_info *rt)
3937 int nexthop_len = 0;
3939 if (rt->rt6i_nsiblings) {
3940 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
3941 + NLA_ALIGN(sizeof(struct rtnexthop))
3942 + nla_total_size(16) /* RTA_GATEWAY */
3943 + lwtunnel_get_encap_size(rt->dst.lwtstate);
/* One rtnexthop record per sibling. */
3945 nexthop_len *= rt->rt6i_nsiblings;
3948 return NLMSG_ALIGN(sizeof(struct rtmsg))
3949 + nla_total_size(16) /* RTA_SRC */
3950 + nla_total_size(16) /* RTA_DST */
3951 + nla_total_size(16) /* RTA_GATEWAY */
3952 + nla_total_size(16) /* RTA_PREFSRC */
3953 + nla_total_size(4) /* RTA_TABLE */
3954 + nla_total_size(4) /* RTA_IIF */
3955 + nla_total_size(4) /* RTA_OIF */
3956 + nla_total_size(4) /* RTA_PRIORITY */
3957 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
3958 + nla_total_size(sizeof(struct rta_cacheinfo))
3959 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
3960 + nla_total_size(1) /* RTA_PREF */
3961 + lwtunnel_get_encap_size(rt->dst.lwtstate)
/* Emit per-nexthop netlink attributes for @rt (gateway, oif, encap)
 * and accumulate RTNH flags (LINKDOWN/DEAD/OFFLOAD) into @flags.
 * @skip_oif suppresses RTA_OIF for multipath encoding, where the
 * ifindex already lives in struct rtnexthop.
 */
3965 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
3966 unsigned int *flags, bool skip_oif)
3968 if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) {
3969 *flags |= RTNH_F_LINKDOWN;
3970 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
3971 *flags |= RTNH_F_DEAD;
3974 if (rt->rt6i_flags & RTF_GATEWAY) {
3975 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
3976 goto nla_put_failure;
3979 if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
3980 *flags |= RTNH_F_OFFLOAD;
3982 /* not needed for multipath encoding b/c it has a rtnexthop struct */
3983 if (!skip_oif && rt->dst.dev &&
3984 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
3985 goto nla_put_failure;
3987 if (rt->dst.lwtstate &&
3988 lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
3989 goto nla_put_failure;
3997 /* add multipath next hop */
/* add multipath next hop */
/* Reserve and fill one struct rtnexthop inside RTA_MULTIPATH for @rt,
 * delegating attribute emission to rt6_nexthop_info(skip_oif=true).
 */
3998 static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
4000 struct rtnexthop *rtnh;
4001 unsigned int flags = 0;
4003 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4005 goto nla_put_failure;
4007 rtnh->rtnh_hops = 0;
4008 rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
4010 if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4011 goto nla_put_failure;
4013 rtnh->rtnh_flags = flags;
4015 /* length of rtnetlink header + attributes */
4016 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
/* Serialise route @rt into an rtnetlink message of @type on @skb.
 * @dst/@src, when non-NULL, override the route's own prefixes (used by
 * getroute replies); @iif marks an input-side query.  Returns < 0 and
 * cancels the message when the skb runs out of room.
 * NOTE(review): several else-branches and goto targets are elided in
 * this extraction.
 */
4024 static int rt6_fill_node(struct net *net,
4025 struct sk_buff *skb, struct rt6_info *rt,
4026 struct in6_addr *dst, struct in6_addr *src,
4027 int iif, int type, u32 portid, u32 seq,
4030 u32 metrics[RTAX_MAX];
4032 struct nlmsghdr *nlh;
4036 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4040 rtm = nlmsg_data(nlh);
4041 rtm->rtm_family = AF_INET6;
4042 rtm->rtm_dst_len = rt->rt6i_dst.plen;
4043 rtm->rtm_src_len = rt->rt6i_src.plen;
4046 table = rt->rt6i_table->tb6_id;
4048 table = RT6_TABLE_UNSPEC;
4049 rtm->rtm_table = table;
4050 if (nla_put_u32(skb, RTA_TABLE, table))
4051 goto nla_put_failure;
/* Map reject routes back to a specific RTN_* type via dst.error. */
4052 if (rt->rt6i_flags & RTF_REJECT) {
4053 switch (rt->dst.error) {
4055 rtm->rtm_type = RTN_BLACKHOLE;
4058 rtm->rtm_type = RTN_PROHIBIT;
4061 rtm->rtm_type = RTN_THROW;
4064 rtm->rtm_type = RTN_UNREACHABLE;
4068 else if (rt->rt6i_flags & RTF_LOCAL)
4069 rtm->rtm_type = RTN_LOCAL;
4070 else if (rt->rt6i_flags & RTF_ANYCAST)
4071 rtm->rtm_type = RTN_ANYCAST;
4072 else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
4073 rtm->rtm_type = RTN_LOCAL;
4075 rtm->rtm_type = RTN_UNICAST;
4077 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4078 rtm->rtm_protocol = rt->rt6i_protocol;
4080 if (rt->rt6i_flags & RTF_CACHE)
4081 rtm->rtm_flags |= RTM_F_CLONED;
/* A caller-supplied destination is a full /128 host answer. */
4084 if (nla_put_in6_addr(skb, RTA_DST, dst))
4085 goto nla_put_failure;
4086 rtm->rtm_dst_len = 128;
4087 } else if (rtm->rtm_dst_len)
4088 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
4089 goto nla_put_failure;
4090 #ifdef CONFIG_IPV6_SUBTREES
4092 if (nla_put_in6_addr(skb, RTA_SRC, src))
4093 goto nla_put_failure;
4094 rtm->rtm_src_len = 128;
4095 } else if (rtm->rtm_src_len &&
4096 nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
4097 goto nla_put_failure;
4100 #ifdef CONFIG_IPV6_MROUTE
4101 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
4102 int err = ip6mr_get_route(net, skb, rtm, portid);
4107 goto nla_put_failure;
4110 if (nla_put_u32(skb, RTA_IIF, iif))
4111 goto nla_put_failure;
4113 struct in6_addr saddr_buf;
4114 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
4115 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4116 goto nla_put_failure;
4119 if (rt->rt6i_prefsrc.plen) {
4120 struct in6_addr saddr_buf;
4121 saddr_buf = rt->rt6i_prefsrc.addr;
4122 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4123 goto nla_put_failure;
4126 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
/* A stored PMTU (elided condition) overrides the MTU metric. */
4128 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
4129 if (rtnetlink_put_metrics(skb, metrics) < 0)
4130 goto nla_put_failure;
4132 if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
4133 goto nla_put_failure;
4135 /* For multipath routes, walk the siblings list and add
4136 * each as a nexthop within RTA_MULTIPATH.
4138 if (rt->rt6i_nsiblings) {
4139 struct rt6_info *sibling, *next_sibling;
4142 mp = nla_nest_start(skb, RTA_MULTIPATH);
4144 goto nla_put_failure;
4146 if (rt6_add_nexthop(skb, rt) < 0)
4147 goto nla_put_failure;
4149 list_for_each_entry_safe(sibling, next_sibling,
4150 &rt->rt6i_siblings, rt6i_siblings) {
4151 if (rt6_add_nexthop(skb, sibling) < 0)
4152 goto nla_put_failure;
4155 nla_nest_end(skb, mp);
4157 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4158 goto nla_put_failure;
4161 expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
4163 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
4164 goto nla_put_failure;
4166 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
4167 goto nla_put_failure;
4170 nlmsg_end(skb, nlh);
/* Out-of-room: undo the partially-built message. */
4174 nlmsg_cancel(skb, nlh);
/* fib6 tree-walk callback used by the RTM_GETROUTE dump: emit one
 * route per call via rt6_fill_node(), honouring the dump request's
 * RTM_F_PREFIX filter and skipping the null entry.
 */
4178 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
4180 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4181 struct net *net = arg->net;
4183 if (rt == net->ipv6.ip6_null_entry)
4186 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4187 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4189 /* user wants prefix routes only */
4190 if (rtm->rtm_flags & RTM_F_PREFIX &&
4191 !(rt->rt6i_flags & RTF_PREFIX_RT)) {
4192 /* success since this is not a prefix route */
4197 return rt6_fill_node(net,
4198 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
4199 NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
/* RTM_GETROUTE handler: build a flowi6 from the request attributes,
 * perform an input- or output-side route lookup (fibmatch returns the
 * FIB entry rather than the cloned dst) and reply with a single
 * rt6_fill_node() message to the requester.
 * NOTE(review): errout labels and rcu_read_unlock pairs are elided in
 * this extraction.
 */
4203 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4204 struct netlink_ext_ack *extack)
4206 struct net *net = sock_net(in_skb->sk);
4207 struct nlattr *tb[RTA_MAX+1];
4208 int err, iif = 0, oif = 0;
4209 struct dst_entry *dst;
4210 struct rt6_info *rt;
4211 struct sk_buff *skb;
4216 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4222 memset(&fl6, 0, sizeof(fl6));
4223 rtm = nlmsg_data(nlh);
4224 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4225 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4228 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4231 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4235 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4238 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4242 iif = nla_get_u32(tb[RTA_IIF]);
4245 oif = nla_get_u32(tb[RTA_OIF]);
4248 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4251 fl6.flowi6_uid = make_kuid(current_user_ns(),
4252 nla_get_u32(tb[RTA_UID]));
4254 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
/* Input-side query: resolve the incoming device and do an input
 * lookup; otherwise do an output lookup keyed on oif. */
4257 struct net_device *dev;
4262 dev = dev_get_by_index_rcu(net, iif);
4269 fl6.flowi6_iif = iif;
4271 if (!ipv6_addr_any(&fl6.saddr))
4272 flags |= RT6_LOOKUP_F_HAS_SADDR;
4275 dst = ip6_route_input_lookup(net, dev, &fl6, flags);
4277 dst = ip6_route_lookup(net, &fl6, 0);
4281 fl6.flowi6_oif = oif;
4284 dst = ip6_route_output(net, NULL, &fl6);
4286 dst = ip6_route_lookup(net, &fl6, 0);
4290 rt = container_of(dst, struct rt6_info, dst);
4291 if (rt->dst.error) {
4292 err = rt->dst.error;
4297 if (rt == net->ipv6.ip6_null_entry) {
4298 err = rt->dst.error;
4303 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4310 skb_dst_set(skb, &rt->dst);
/* fibmatch reports the FIB entry itself, without dst/src overrides. */
4312 err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
4313 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4316 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
4317 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4324 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
/*
 * inet6_rt_notify - broadcast a route change (@event, e.g. RTM_NEWROUTE
 * or RTM_DELROUTE) for @rt to RTNLGRP_IPV6_ROUTE multicast listeners.
 */
4329 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
4330 unsigned int nlm_flags)
4332 struct sk_buff *skb;
4333 struct net *net = info->nl_net;
/* Echo the requester's sequence number when the change originated from
 * a netlink request; kernel-originated events use seq 0. */
4338 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
/* gfp_any() picks an allocation mode suitable for the current context. */
4340 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4344 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
4345 event, info->portid, seq, nlm_flags);
4347 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4348 WARN_ON(err == -EMSGSIZE);
4352 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4353 info->nlh, gfp_any());
/* On failure, record the error on the multicast group's sockets. */
4357 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
/*
 * ip6_route_dev_notify - netdevice notifier: bind the per-netns special
 * route entries (null, plus prohibit/blackhole with multiple tables) to
 * the loopback device on register, and drop their idev references on
 * unregister.
 */
4360 static int ip6_route_dev_notify(struct notifier_block *this,
4361 unsigned long event, void *ptr)
4363 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4364 struct net *net = dev_net(dev);
/* Only the loopback device hosts the special route entries. */
4366 if (!(dev->flags & IFF_LOOPBACK))
4369 if (event == NETDEV_REGISTER) {
4370 net->ipv6.ip6_null_entry->dst.dev = dev;
4371 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
4372 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4373 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
4374 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
4375 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
4376 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
/* reg_state check guards against repeated NETDEV_UNREGISTER delivery. */
4378 } else if (event == NETDEV_UNREGISTER &&
4379 dev->reg_state != NETREG_UNREGISTERED) {
4380 /* NETDEV_UNREGISTER could be fired for multiple times by
4381 * netdev_wait_allrefs(). Make sure we only call this once.
4383 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
4384 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4385 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
4386 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
4397 #ifdef CONFIG_PROC_FS
/* File operations for /proc/net/ipv6_route (seq_file route dump). */
4399 static const struct file_operations ipv6_route_proc_fops = {
4400 .owner = THIS_MODULE,
4401 .open = ipv6_route_open,
4403 .llseek = seq_lseek,
4404 .release = seq_release_net,
/*
 * rt6_stats_seq_show - emit the single /proc/net/rt6_stats line: seven
 * zero-padded hex counters describing FIB node/route/cache usage for
 * this network namespace.
 */
4407 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
4409 struct net *net = (struct net *)seq->private;
4410 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
4411 net->ipv6.rt6_stats->fib_nodes,
4412 net->ipv6.rt6_stats->fib_route_nodes,
4413 atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
4414 net->ipv6.rt6_stats->fib_rt_entries,
4415 net->ipv6.rt6_stats->fib_rt_cache,
/* dst_entries_get_slow(): exact (summed) dst entry count, not the
 * per-cpu fast estimate. */
4416 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
4417 net->ipv6.rt6_stats->fib_discarded_routes);
/* open() for /proc/net/rt6_stats: single-record seq_file bound to the
 * owning network namespace. */
4422 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
4424 return single_open_net(inode, file, rt6_stats_seq_show);
/* File operations for /proc/net/rt6_stats. */
4427 static const struct file_operations rt6_stats_seq_fops = {
4428 .owner = THIS_MODULE,
4429 .open = rt6_stats_seq_open,
4431 .llseek = seq_lseek,
4432 .release = single_release_net,
4434 #endif /* CONFIG_PROC_FS */
4436 #ifdef CONFIG_SYSCTL
/*
 * ipv6_sysctl_rtcache_flush - handler for net.ipv6.route.flush: writing
 * to it triggers a fib6 garbage-collection run for the owning netns.
 */
4439 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4440 void __user *buffer, size_t *lenp, loff_t *ppos)
/* ->extra1 was pointed at the netns by ipv6_route_sysctl_init(). */
4447 net = (struct net *)ctl->extra1;
4448 delay = net->ipv6.sysctl.flush_delay;
4449 proc_dointvec(ctl, write, buffer, lenp, ppos);
/* delay <= 0 requests an expedited run (timeout 0, force_expire off);
 * positive delay passes the value through and sets the expire flag. */
4450 fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
/*
 * Template for the per-netns net.ipv6.route.* sysctl table.  The .data
 * pointers initially reference init_net (or the template dst_ops) and
 * are rewritten per namespace in ipv6_route_sysctl_init(); the entry
 * order there must match the order here.
 */
4454 struct ctl_table ipv6_route_table_template[] = {
/* "flush": write-triggered GC, handled by ipv6_sysctl_rtcache_flush. */
4456 .procname = "flush",
4457 .data = &init_net.ipv6.sysctl.flush_delay,
4458 .maxlen = sizeof(int),
4460 .proc_handler = ipv6_sysctl_rtcache_flush
/* "gc_thresh" is global (template dst_ops), not per-netns data. */
4463 .procname = "gc_thresh",
4464 .data = &ip6_dst_ops_template.gc_thresh,
4465 .maxlen = sizeof(int),
4467 .proc_handler = proc_dointvec,
4470 .procname = "max_size",
4471 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
4472 .maxlen = sizeof(int),
4474 .proc_handler = proc_dointvec,
/* Time-valued knobs use the jiffies conversion handlers. */
4477 .procname = "gc_min_interval",
4478 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4479 .maxlen = sizeof(int),
4481 .proc_handler = proc_dointvec_jiffies,
4484 .procname = "gc_timeout",
4485 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
4486 .maxlen = sizeof(int),
4488 .proc_handler = proc_dointvec_jiffies,
4491 .procname = "gc_interval",
4492 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
4493 .maxlen = sizeof(int),
4495 .proc_handler = proc_dointvec_jiffies,
4498 .procname = "gc_elasticity",
4499 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
4500 .maxlen = sizeof(int),
4502 .proc_handler = proc_dointvec,
4505 .procname = "mtu_expires",
4506 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
4507 .maxlen = sizeof(int),
4509 .proc_handler = proc_dointvec_jiffies,
4512 .procname = "min_adv_mss",
4513 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
4514 .maxlen = sizeof(int),
4516 .proc_handler = proc_dointvec,
/* Millisecond view of the same gc_min_interval storage as above. */
4519 .procname = "gc_min_interval_ms",
4520 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4521 .maxlen = sizeof(int),
4523 .proc_handler = proc_dointvec_ms_jiffies,
/*
 * ipv6_route_sysctl_init - duplicate ipv6_route_table_template for a
 * netns and retarget each entry's .data at that namespace's own fields.
 * The indices below are positional and must stay in sync with the
 * template's entry order.  Returns the table (caller registers/frees it).
 */
4528 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
4530 struct ctl_table *table;
4532 table = kmemdup(ipv6_route_table_template,
4533 sizeof(ipv6_route_table_template),
4537 table[0].data = &net->ipv6.sysctl.flush_delay;
/* extra1 carries the netns for ipv6_sysctl_rtcache_flush(). */
4538 table[0].extra1 = net;
4539 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
4540 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
4541 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4542 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
4543 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
4544 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
4545 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
4546 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
/* gc_min_interval_ms shares storage with gc_min_interval (index 3). */
4547 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4549 /* Don't export sysctls to unprivileged users */
4550 if (net->user_ns != &init_user_ns)
4551 table[0].procname = NULL;
/*
 * ip6_route_net_init - per-netns setup: clone the dst_ops template,
 * allocate the special route entries (null, and with multiple tables
 * prohibit/blackhole), and seed the routing sysctl defaults.
 * Error paths unwind allocations in reverse order via gotos.
 * NOTE(review): some lines (labels, returns) are elided in this view.
 */
4558 static int __net_init ip6_route_net_init(struct net *net)
4562 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
4563 sizeof(net->ipv6.ip6_dst_ops));
4565 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
4566 goto out_ip6_dst_ops;
/* Null entry: the "no route" sentinel every lookup can fall back to. */
4568 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
4569 sizeof(*net->ipv6.ip6_null_entry),
4571 if (!net->ipv6.ip6_null_entry)
4572 goto out_ip6_dst_entries;
/* Each special entry is its own dst.path and uses this netns's ops. */
4573 net->ipv6.ip6_null_entry->dst.path =
4574 (struct dst_entry *)net->ipv6.ip6_null_entry;
4575 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4576 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
4577 ip6_template_metrics, true);
4579 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4580 net->ipv6.fib6_has_custom_rules = false;
4581 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
4582 sizeof(*net->ipv6.ip6_prohibit_entry),
4584 if (!net->ipv6.ip6_prohibit_entry)
4585 goto out_ip6_null_entry;
4586 net->ipv6.ip6_prohibit_entry->dst.path =
4587 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
4588 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4589 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
4590 ip6_template_metrics, true);
4592 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
4593 sizeof(*net->ipv6.ip6_blk_hole_entry),
4595 if (!net->ipv6.ip6_blk_hole_entry)
4596 goto out_ip6_prohibit_entry;
4597 net->ipv6.ip6_blk_hole_entry->dst.path =
4598 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
4599 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4600 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
4601 ip6_template_metrics, true);
/* Default values for the net.ipv6.route.* sysctls of this netns. */
4604 net->ipv6.sysctl.flush_delay = 0;
4605 net->ipv6.sysctl.ip6_rt_max_size = 4096;
4606 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
4607 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
4608 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
4609 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
4610 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
/* IPV6_MIN_MTU minus TCP (20) and IPv6 (40) header overhead. */
4611 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
4613 net->ipv6.ip6_rt_gc_expire = 30*HZ;
/* Unwind in reverse allocation order on failure. */
4619 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4620 out_ip6_prohibit_entry:
4621 kfree(net->ipv6.ip6_prohibit_entry);
4623 kfree(net->ipv6.ip6_null_entry);
4625 out_ip6_dst_entries:
4626 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/*
 * ip6_route_net_exit - per-netns teardown: free the special route
 * entries allocated by ip6_route_net_init() and release the dst entry
 * counter.
 */
4631 static void __net_exit ip6_route_net_exit(struct net *net)
4633 kfree(net->ipv6.ip6_null_entry);
4634 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4635 kfree(net->ipv6.ip6_prohibit_entry);
4636 kfree(net->ipv6.ip6_blk_hole_entry);
4638 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/*
 * ip6_route_net_init_late - late per-netns setup: create the
 * /proc/net/ipv6_route and /proc/net/rt6_stats entries.
 */
4641 static int __net_init ip6_route_net_init_late(struct net *net)
4643 #ifdef CONFIG_PROC_FS
4644 proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
4645 proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
/*
 * ip6_route_net_exit_late - remove the proc entries created by
 * ip6_route_net_init_late().
 */
4650 static void __net_exit ip6_route_net_exit_late(struct net *net)
4652 #ifdef CONFIG_PROC_FS
4653 remove_proc_entry("ipv6_route", net->proc_net);
4654 remove_proc_entry("rt6_stats", net->proc_net);
/* Pernet hooks for the core route state (special entries, sysctls). */
4658 static struct pernet_operations ip6_route_net_ops = {
4659 .init = ip6_route_net_init,
4660 .exit = ip6_route_net_exit,
/*
 * ipv6_inetpeer_init - allocate and initialize this netns's IPv6
 * inetpeer base (peer cache root).
 */
4663 static int __net_init ipv6_inetpeer_init(struct net *net)
4665 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
4669 inet_peer_base_init(bp);
4670 net->ipv6.peers = bp;
/*
 * ipv6_inetpeer_exit - detach and invalidate this netns's inetpeer
 * base; the pointer is cleared before teardown begins.
 */
4674 static void __net_exit ipv6_inetpeer_exit(struct net *net)
4676 struct inet_peer_base *bp = net->ipv6.peers;
4678 net->ipv6.peers = NULL;
4679 inetpeer_invalidate_tree(bp);
/* Pernet hooks for the IPv6 inetpeer cache. */
4683 static struct pernet_operations ipv6_inetpeer_ops = {
4684 .init = ipv6_inetpeer_init,
4685 .exit = ipv6_inetpeer_exit,
/* Pernet hooks registered last: proc entries for route state. */
4688 static struct pernet_operations ip6_route_net_late_ops = {
4689 .init = ip6_route_net_init_late,
4690 .exit = ip6_route_net_exit_late,
/* Netdevice notifier; priority places it relative to addrconf's
 * notifier so ordering between the two is defined. */
4693 static struct notifier_block ip6_route_dev_notifier = {
4694 .notifier_call = ip6_route_dev_notify,
4695 .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
/*
 * ip6_route_init_special_entries - for init_net only: bind the special
 * route entries to the loopback device by hand, because loopback was
 * registered before ip6_route_dev_notify() could observe it.
 */
4698 void __init ip6_route_init_special_entries(void)
4700 /* Registering of the loopback is done before this portion of code,
4701 * the loopback reference in rt6_info will not be taken, do it
4702 * manually for init_net */
4703 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
4704 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4705 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4706 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
4707 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4708 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
4709 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
/*
 * ip6_route_init - boot-time initialization of the IPv6 routing layer:
 * dst slab cache, pernet subsystems, FIB rules, rtnetlink handlers,
 * device notifier and per-cpu uncached-route lists.  Failures unwind
 * through the labels at the bottom in reverse registration order.
 * NOTE(review): some lines (returns, intermediate labels) are elided
 * in this view.
 */
4713 int __init ip6_route_init(void)
4719 ip6_dst_ops_template.kmem_cachep =
4720 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
4721 SLAB_HWCACHE_ALIGN, NULL);
4722 if (!ip6_dst_ops_template.kmem_cachep)
4725 ret = dst_entries_init(&ip6_dst_blackhole_ops);
4727 goto out_kmem_cache;
4729 ret = register_pernet_subsys(&ipv6_inetpeer_ops);
4731 goto out_dst_entries;
4733 ret = register_pernet_subsys(&ip6_route_net_ops);
4735 goto out_register_inetpeer;
/* Blackhole dsts share the same slab as regular rt6_info entries. */
4737 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
4741 goto out_register_subsys;
4747 ret = fib6_rules_init();
4751 ret = register_pernet_subsys(&ip6_route_net_late_ops);
4753 goto fib6_rules_init;
/* GETROUTE may run without the rtnl lock (RTNL_FLAG_DOIT_UNLOCKED). */
4756 if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, 0) ||
4757 __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, 0) ||
4758 __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL,
4759 RTNL_FLAG_DOIT_UNLOCKED))
4760 goto out_register_late_subsys;
4762 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
4764 goto out_register_late_subsys;
/* Per-cpu lists of routes not attached to the FIB tree. */
4766 for_each_possible_cpu(cpu) {
4767 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
4769 INIT_LIST_HEAD(&ul->head);
4770 spin_lock_init(&ul->lock);
/* Error unwind: reverse order of the registrations above. */
4776 out_register_late_subsys:
4777 unregister_pernet_subsys(&ip6_route_net_late_ops);
4779 fib6_rules_cleanup();
4784 out_register_subsys:
4785 unregister_pernet_subsys(&ip6_route_net_ops);
4786 out_register_inetpeer:
4787 unregister_pernet_subsys(&ipv6_inetpeer_ops);
4789 dst_entries_destroy(&ip6_dst_blackhole_ops);
4791 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
4795 void ip6_route_cleanup(void)
4797 unregister_netdevice_notifier(&ip6_route_dev_notifier);
4798 unregister_pernet_subsys(&ip6_route_net_late_ops);
4799 fib6_rules_cleanup();
4802 unregister_pernet_subsys(&ipv6_inetpeer_ops);
4803 unregister_pernet_subsys(&ip6_route_net_ops);
4804 dst_entries_destroy(&ip6_dst_blackhole_ops);
4805 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);