2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
24 * Fixed routing subtrees.
27 #define pr_fmt(fmt) "IPv6: " fmt
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
56 #include <linux/rtnetlink.h>
58 #include <net/dst_metadata.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <trace/events/fib6.h>
68 #include <linux/uaccess.h>
71 #include <linux/sysctl.h>
75 RT6_NUD_FAIL_HARD = -3,
76 RT6_NUD_FAIL_PROBE = -2,
77 RT6_NUD_FAIL_DO_RR = -1,
81 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
82 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
83 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
84 static unsigned int ip6_mtu(const struct dst_entry *dst);
85 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
86 static void ip6_dst_destroy(struct dst_entry *);
87 static void ip6_dst_ifdown(struct dst_entry *,
88 struct net_device *dev, int how);
89 static int ip6_dst_gc(struct dst_ops *ops);
91 static int ip6_pkt_discard(struct sk_buff *skb);
92 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
93 static int ip6_pkt_prohibit(struct sk_buff *skb);
94 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
95 static void ip6_link_failure(struct sk_buff *skb);
96 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
97 struct sk_buff *skb, u32 mtu);
98 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
100 static void rt6_dst_from_metrics_check(struct rt6_info *rt);
101 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
102 static size_t rt6_nlmsg_size(struct rt6_info *rt);
103 static int rt6_fill_node(struct net *net,
104 struct sk_buff *skb, struct rt6_info *rt,
105 struct in6_addr *dst, struct in6_addr *src,
106 int iif, int type, u32 portid, u32 seq,
108 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
109 struct in6_addr *daddr,
110 struct in6_addr *saddr);
112 #ifdef CONFIG_IPV6_ROUTE_INFO
113 static struct rt6_info *rt6_add_route_info(struct net *net,
114 const struct in6_addr *prefix, int prefixlen,
115 const struct in6_addr *gwaddr,
116 struct net_device *dev,
118 static struct rt6_info *rt6_get_route_info(struct net *net,
119 const struct in6_addr *prefix, int prefixlen,
120 const struct in6_addr *gwaddr,
121 struct net_device *dev);
124 struct uncached_list {
126 struct list_head head;
129 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
/* Link an uncached (not in the FIB tree) rt6_info onto this CPU's
 * uncached list so it can be found later by rt6_uncached_list_flush_dev().
 * NOTE(review): hunk appears truncated by extraction (embedded original
 * line numbers skip); compare against upstream net/ipv6/route.c.
 */
131 static void rt6_uncached_list_add(struct rt6_info *rt)
133 struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
/* remember which CPU's list holds us, for the later del */
135 rt->rt6i_uncached_list = ul;
/* _bh lock: the list is also touched from softirq context */
137 spin_lock_bh(&ul->lock);
138 list_add_tail(&rt->rt6i_uncached, &ul->head);
139 spin_unlock_bh(&ul->lock);
/* Unlink @rt from the per-CPU uncached list it was added to and drop the
 * fib_rt_uncache statistic.  Safe to call on routes that were never added
 * (the list_empty() check makes it a no-op then).
 */
142 static void rt6_uncached_list_del(struct rt6_info *rt)
144 if (!list_empty(&rt->rt6i_uncached)) {
/* use the list recorded at add time -- may differ from the current CPU */
145 struct uncached_list *ul = rt->rt6i_uncached_list;
146 struct net *net = dev_net(rt->dst.dev);
148 spin_lock_bh(&ul->lock);
149 list_del(&rt->rt6i_uncached);
150 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
151 spin_unlock_bh(&ul->lock);
/* Walk every CPU's uncached list and re-home entries that reference the
 * departing device @dev onto the loopback device (both the inet6_dev and
 * the dst's net_device), so the device can be unregistered safely.
 * No-op when @dev is already the loopback device.
 * NOTE(review): some lines (cpu/rt declarations, closing braces) were lost
 * in extraction; restore from upstream before building.
 */
155 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
157 struct net_device *loopback_dev = net->loopback_dev;
160 if (dev == loopback_dev)
163 for_each_possible_cpu(cpu) {
164 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
167 spin_lock_bh(&ul->lock);
168 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
169 struct inet6_dev *rt_idev = rt->rt6i_idev;
170 struct net_device *rt_dev = rt->dst.dev;
/* swap the idev reference: take loopback, drop the old one */
172 if (rt_idev->dev == dev) {
173 rt->rt6i_idev = in6_dev_get(loopback_dev);
174 in6_dev_put(rt_idev);
/* presumably guarded by "if (rt_dev == dev)" upstream -- line lost here */
178 rt->dst.dev = loopback_dev;
179 dev_hold(rt->dst.dev);
183 spin_unlock_bh(&ul->lock);
/* Per-CPU clones share metrics with their parent route: write through to
 * the parent's (dst.from) metrics rather than COW-ing a private copy.
 */
187 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
189 return dst_metrics_write_ptr(rt->dst.from);
/* dst_ops.cow_metrics hook: pick the metrics storage strategy by route
 * type -- per-CPU clones defer to the parent, everything else gets the
 * generic copy-on-write path.  (The RTF_CACHE branch body was lost in
 * extraction; upstream returns NULL there.)
 */
192 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
194 struct rt6_info *rt = (struct rt6_info *)dst;
196 if (rt->rt6i_flags & RTF_PCPU)
197 return rt6_pcpu_cow_metrics(rt);
198 else if (rt->rt6i_flags & RTF_CACHE)
201 return dst_cow_metrics_generic(dst, old);
/* Pick the address to resolve for this route's next hop: the gateway if
 * one is set, otherwise fall back to the packet's destination address
 * (the caller-supplied daddr path is in a hunk lost to extraction).
 */
204 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
208 struct in6_addr *p = &rt->rt6i_gateway;
/* non-any gateway wins over the packet/caller destination */
210 if (!ipv6_addr_any(p))
211 return (const void *) p;
213 return &ipv6_hdr(skb)->daddr;
/* dst_ops.neigh_lookup hook: find (or create) the neighbour entry for the
 * next hop chosen by choose_neigh_daddr().
 */
217 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
221 struct rt6_info *rt = (struct rt6_info *) dst;
224 daddr = choose_neigh_daddr(rt, skb, daddr);
225 n = __ipv6_neigh_lookup(dst->dev, daddr);
/* cache miss: create a fresh entry in the IPv6 neighbour table */
228 return neigh_create(&nd_tbl, daddr, dst->dev);
/* dst_ops.confirm_neigh hook: confirm reachability of the next hop, but
 * skip devices with no neighbour resolution and multicast destinations,
 * where confirmation is meaningless.
 */
231 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
233 struct net_device *dev = dst->dev;
234 struct rt6_info *rt = (struct rt6_info *)dst;
236 daddr = choose_neigh_daddr(rt, NULL, daddr);
239 if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
241 if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
243 __ipv6_confirm_neigh(dev, daddr);
246 static struct dst_ops ip6_dst_ops_template = {
250 .check = ip6_dst_check,
251 .default_advmss = ip6_default_advmss,
253 .cow_metrics = ipv6_cow_metrics,
254 .destroy = ip6_dst_destroy,
255 .ifdown = ip6_dst_ifdown,
256 .negative_advice = ip6_negative_advice,
257 .link_failure = ip6_link_failure,
258 .update_pmtu = ip6_rt_update_pmtu,
259 .redirect = rt6_do_redirect,
260 .local_out = __ip6_local_out,
261 .neigh_lookup = ip6_neigh_lookup,
262 .confirm_neigh = ip6_confirm_neigh,
265 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
267 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
269 return mtu ? : dst->dev->mtu;
272 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
273 struct sk_buff *skb, u32 mtu)
277 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
282 static struct dst_ops ip6_dst_blackhole_ops = {
284 .destroy = ip6_dst_destroy,
285 .check = ip6_dst_check,
286 .mtu = ip6_blackhole_mtu,
287 .default_advmss = ip6_default_advmss,
288 .update_pmtu = ip6_rt_blackhole_update_pmtu,
289 .redirect = ip6_rt_blackhole_redirect,
290 .cow_metrics = dst_cow_metrics_generic,
291 .neigh_lookup = ip6_neigh_lookup,
294 static const u32 ip6_template_metrics[RTAX_MAX] = {
295 [RTAX_HOPLIMIT - 1] = 0,
298 static const struct rt6_info ip6_null_entry_template = {
300 .__refcnt = ATOMIC_INIT(1),
302 .obsolete = DST_OBSOLETE_FORCE_CHK,
303 .error = -ENETUNREACH,
304 .input = ip6_pkt_discard,
305 .output = ip6_pkt_discard_out,
307 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
308 .rt6i_protocol = RTPROT_KERNEL,
309 .rt6i_metric = ~(u32) 0,
310 .rt6i_ref = ATOMIC_INIT(1),
313 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
315 static const struct rt6_info ip6_prohibit_entry_template = {
317 .__refcnt = ATOMIC_INIT(1),
319 .obsolete = DST_OBSOLETE_FORCE_CHK,
321 .input = ip6_pkt_prohibit,
322 .output = ip6_pkt_prohibit_out,
324 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
325 .rt6i_protocol = RTPROT_KERNEL,
326 .rt6i_metric = ~(u32) 0,
327 .rt6i_ref = ATOMIC_INIT(1),
330 static const struct rt6_info ip6_blk_hole_entry_template = {
332 .__refcnt = ATOMIC_INIT(1),
334 .obsolete = DST_OBSOLETE_FORCE_CHK,
336 .input = dst_discard,
337 .output = dst_discard_out,
339 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
340 .rt6i_protocol = RTPROT_KERNEL,
341 .rt6i_metric = ~(u32) 0,
342 .rt6i_ref = ATOMIC_INIT(1),
/* Zero the rt6_info-specific tail of a freshly allocated dst (everything
 * after the embedded struct dst_entry) and init the list heads so later
 * list_empty() checks are valid.
 */
347 static void rt6_info_init(struct rt6_info *rt)
349 struct dst_entry *dst = &rt->dst;
/* "dst + 1" = first byte past the dst_entry header inside rt6_info */
351 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
352 INIT_LIST_HEAD(&rt->rt6i_siblings);
353 INIT_LIST_HEAD(&rt->rt6i_uncached);
356 /* allocate dst with ip6_dst_ops */
357 static struct rt6_info *__ip6_dst_alloc(struct net *net,
358 struct net_device *dev,
361 struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
362 1, DST_OBSOLETE_FORCE_CHK, flags);
366 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
372 struct rt6_info *ip6_dst_alloc(struct net *net,
373 struct net_device *dev,
376 struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
379 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
383 for_each_possible_cpu(cpu) {
386 p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
387 /* no one shares rt */
391 dst_release_immediate(&rt->dst);
398 EXPORT_SYMBOL(ip6_dst_alloc);
400 static void ip6_dst_destroy(struct dst_entry *dst)
402 struct rt6_info *rt = (struct rt6_info *)dst;
403 struct rt6_exception_bucket *bucket;
404 struct dst_entry *from = dst->from;
405 struct inet6_dev *idev;
407 dst_destroy_metrics_generic(dst);
408 free_percpu(rt->rt6i_pcpu);
409 rt6_uncached_list_del(rt);
411 idev = rt->rt6i_idev;
413 rt->rt6i_idev = NULL;
416 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
418 rt->rt6i_exception_bucket = NULL;
426 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
429 struct rt6_info *rt = (struct rt6_info *)dst;
430 struct inet6_dev *idev = rt->rt6i_idev;
431 struct net_device *loopback_dev =
432 dev_net(dev)->loopback_dev;
434 if (idev && idev->dev != loopback_dev) {
435 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
437 rt->rt6i_idev = loopback_idev;
/* Non-recursive expiry test: only looks at this route's own RTF_EXPIRES
 * timestamp (rt6_check_expired() below also follows dst.from).
 */
443 static bool __rt6_check_expired(const struct rt6_info *rt)
445 if (rt->rt6i_flags & RTF_EXPIRES)
446 return time_after(jiffies, rt->dst.expires)
/* Full expiry test: a route is expired if its own RTF_EXPIRES deadline
 * passed, or -- for clones -- if its parent (dst.from) became obsolete or
 * expired (checked recursively).
 */
451 static bool rt6_check_expired(const struct rt6_info *rt)
453 if (rt->rt6i_flags & RTF_EXPIRES) {
454 if (time_after(jiffies, rt->dst.expires))
456 } else if (rt->dst.from) {
457 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
458 rt6_check_expired((struct rt6_info *)rt->dst.from);
463 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
464 struct flowi6 *fl6, int oif,
467 struct rt6_info *sibling, *next_sibling;
470 /* We might have already computed the hash for ICMPv6 errors. In such
471 * case it will always be non-zero. Otherwise now is the time to do it.
474 fl6->mp_hash = rt6_multipath_hash(fl6, NULL);
476 route_choosen = fl6->mp_hash % (match->rt6i_nsiblings + 1);
477 /* Don't change the route, if route_choosen == 0
478 * (siblings does not include ourself)
481 list_for_each_entry_safe(sibling, next_sibling,
482 &match->rt6i_siblings, rt6i_siblings) {
484 if (route_choosen == 0) {
485 if (rt6_score_route(sibling, oif, strict) < 0)
495 * Route lookup. rcu_read_lock() should be held.
498 static inline struct rt6_info *rt6_device_match(struct net *net,
500 const struct in6_addr *saddr,
504 struct rt6_info *local = NULL;
505 struct rt6_info *sprt;
507 if (!oif && ipv6_addr_any(saddr))
510 for (sprt = rt; sprt; sprt = rcu_dereference(sprt->dst.rt6_next)) {
511 struct net_device *dev = sprt->dst.dev;
514 if (dev->ifindex == oif)
516 if (dev->flags & IFF_LOOPBACK) {
517 if (!sprt->rt6i_idev ||
518 sprt->rt6i_idev->dev->ifindex != oif) {
519 if (flags & RT6_LOOKUP_F_IFACE)
522 local->rt6i_idev->dev->ifindex == oif)
528 if (ipv6_chk_addr(net, saddr, dev,
529 flags & RT6_LOOKUP_F_IFACE))
538 if (flags & RT6_LOOKUP_F_IFACE)
539 return net->ipv6.ip6_null_entry;
545 #ifdef CONFIG_IPV6_ROUTER_PREF
546 struct __rt6_probe_work {
547 struct work_struct work;
548 struct in6_addr target;
549 struct net_device *dev;
552 static void rt6_probe_deferred(struct work_struct *w)
554 struct in6_addr mcaddr;
555 struct __rt6_probe_work *work =
556 container_of(w, struct __rt6_probe_work, work);
558 addrconf_addr_solict_mult(&work->target, &mcaddr);
559 ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
564 static void rt6_probe(struct rt6_info *rt)
566 struct __rt6_probe_work *work;
567 struct neighbour *neigh;
569 * Okay, this does not seem to be appropriate
570 * for now, however, we need to check if it
571 * is really so; aka Router Reachability Probing.
573 * Router Reachability Probe MUST be rate-limited
574 * to no more than one per minute.
576 if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
579 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
581 if (neigh->nud_state & NUD_VALID)
585 write_lock(&neigh->lock);
586 if (!(neigh->nud_state & NUD_VALID) &&
589 rt->rt6i_idev->cnf.rtr_probe_interval)) {
590 work = kmalloc(sizeof(*work), GFP_ATOMIC);
592 __neigh_set_probe_once(neigh);
594 write_unlock(&neigh->lock);
596 work = kmalloc(sizeof(*work), GFP_ATOMIC);
600 INIT_WORK(&work->work, rt6_probe_deferred);
601 work->target = rt->rt6i_gateway;
602 dev_hold(rt->dst.dev);
603 work->dev = rt->dst.dev;
604 schedule_work(&work->work);
608 rcu_read_unlock_bh();
611 static inline void rt6_probe(struct rt6_info *rt)
617 * Default Router Selection (RFC 2461 6.3.6)
/* Score a route's device against the requested output interface @oif:
 * matches when no oif was requested, the device matches directly, or the
 * route is via loopback but bound (through rt6i_idev) to oif.
 * (Return values were lost in extraction; upstream returns 2/1/0.)
 */
619 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
621 struct net_device *dev = rt->dst.dev;
622 if (!oif || dev->ifindex == oif)
624 if ((dev->flags & IFF_LOOPBACK) &&
625 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
630 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
632 struct neighbour *neigh;
633 enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
635 if (rt->rt6i_flags & RTF_NONEXTHOP ||
636 !(rt->rt6i_flags & RTF_GATEWAY))
637 return RT6_NUD_SUCCEED;
640 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
642 read_lock(&neigh->lock);
643 if (neigh->nud_state & NUD_VALID)
644 ret = RT6_NUD_SUCCEED;
645 #ifdef CONFIG_IPV6_ROUTER_PREF
646 else if (!(neigh->nud_state & NUD_FAILED))
647 ret = RT6_NUD_SUCCEED;
649 ret = RT6_NUD_FAIL_PROBE;
651 read_unlock(&neigh->lock);
653 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
654 RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
656 rcu_read_unlock_bh();
661 static int rt6_score_route(struct rt6_info *rt, int oif,
666 m = rt6_check_dev(rt, oif);
667 if (!m && (strict & RT6_LOOKUP_F_IFACE))
668 return RT6_NUD_FAIL_HARD;
669 #ifdef CONFIG_IPV6_ROUTER_PREF
670 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
672 if (strict & RT6_LOOKUP_F_REACHABLE) {
673 int n = rt6_check_neigh(rt);
680 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
681 int *mpri, struct rt6_info *match,
685 bool match_do_rr = false;
686 struct inet6_dev *idev = rt->rt6i_idev;
687 struct net_device *dev = rt->dst.dev;
689 if (dev && !netif_carrier_ok(dev) &&
690 idev->cnf.ignore_routes_with_linkdown &&
691 !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
694 if (rt6_check_expired(rt))
697 m = rt6_score_route(rt, oif, strict);
698 if (m == RT6_NUD_FAIL_DO_RR) {
700 m = 0; /* lowest valid score */
701 } else if (m == RT6_NUD_FAIL_HARD) {
705 if (strict & RT6_LOOKUP_F_REACHABLE)
708 /* note that m can be RT6_NUD_FAIL_PROBE at this point */
710 *do_rr = match_do_rr;
718 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
719 struct rt6_info *leaf,
720 struct rt6_info *rr_head,
721 u32 metric, int oif, int strict,
724 struct rt6_info *rt, *match, *cont;
729 for (rt = rr_head; rt; rt = rcu_dereference(rt->dst.rt6_next)) {
730 if (rt->rt6i_metric != metric) {
735 match = find_match(rt, oif, strict, &mpri, match, do_rr);
738 for (rt = leaf; rt && rt != rr_head;
739 rt = rcu_dereference(rt->dst.rt6_next)) {
740 if (rt->rt6i_metric != metric) {
745 match = find_match(rt, oif, strict, &mpri, match, do_rr);
751 for (rt = cont; rt; rt = rcu_dereference(rt->dst.rt6_next))
752 match = find_match(rt, oif, strict, &mpri, match, do_rr);
757 static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
760 struct rt6_info *leaf = rcu_dereference(fn->leaf);
761 struct rt6_info *match, *rt0;
766 return net->ipv6.ip6_null_entry;
768 rt0 = rcu_dereference(fn->rr_ptr);
772 /* Double check to make sure fn is not an intermediate node
773 * and fn->leaf does not points to its child's leaf
774 * (This might happen if all routes under fn are deleted from
775 * the tree and fib6_repair_tree() is called on the node.)
777 key_plen = rt0->rt6i_dst.plen;
778 #ifdef CONFIG_IPV6_SUBTREES
779 if (rt0->rt6i_src.plen)
780 key_plen = rt0->rt6i_src.plen;
782 if (fn->fn_bit != key_plen)
783 return net->ipv6.ip6_null_entry;
785 match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
789 struct rt6_info *next = rcu_dereference(rt0->dst.rt6_next);
791 /* no entries matched; do round-robin */
792 if (!next || next->rt6i_metric != rt0->rt6i_metric)
796 spin_lock_bh(&leaf->rt6i_table->tb6_lock);
797 /* make sure next is not being deleted from the tree */
799 rcu_assign_pointer(fn->rr_ptr, next);
800 spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
804 return match ? match : net->ipv6.ip6_null_entry;
807 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
809 return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
812 #ifdef CONFIG_IPV6_ROUTE_INFO
813 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
814 const struct in6_addr *gwaddr)
816 struct net *net = dev_net(dev);
817 struct route_info *rinfo = (struct route_info *) opt;
818 struct in6_addr prefix_buf, *prefix;
820 unsigned long lifetime;
823 if (len < sizeof(struct route_info)) {
827 /* Sanity check for prefix_len and length */
828 if (rinfo->length > 3) {
830 } else if (rinfo->prefix_len > 128) {
832 } else if (rinfo->prefix_len > 64) {
833 if (rinfo->length < 2) {
836 } else if (rinfo->prefix_len > 0) {
837 if (rinfo->length < 1) {
842 pref = rinfo->route_pref;
843 if (pref == ICMPV6_ROUTER_PREF_INVALID)
846 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
848 if (rinfo->length == 3)
849 prefix = (struct in6_addr *)rinfo->prefix;
851 /* this function is safe */
852 ipv6_addr_prefix(&prefix_buf,
853 (struct in6_addr *)rinfo->prefix,
855 prefix = &prefix_buf;
858 if (rinfo->prefix_len == 0)
859 rt = rt6_get_dflt_router(gwaddr, dev);
861 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
864 if (rt && !lifetime) {
870 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
873 rt->rt6i_flags = RTF_ROUTEINFO |
874 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
877 if (!addrconf_finite_timeout(lifetime))
878 rt6_clean_expires(rt);
880 rt6_set_expires(rt, jiffies + HZ * lifetime);
/* Walk back up the fib6 tree from @fn looking for the next node that
 * carries route info (RTN_RTINFO), descending into a parent's source
 * subtree when one exists.  Stops at the table root (RTN_TL_ROOT).
 * Caller must hold rcu_read_lock (rcu_dereference on fn->parent).
 * NOTE(review): loop construct and returns lost in extraction.
 */
888 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
889 struct in6_addr *saddr)
891 struct fib6_node *pn, *sn;
/* reached the top of the table -- nothing more to backtrack to */
893 if (fn->fn_flags & RTN_TL_ROOT)
895 pn = rcu_dereference(fn->parent);
896 sn = FIB6_SUBTREE(pn);
/* parent has a source-address subtree: look up saddr inside it */
898 fn = fib6_lookup(sn, NULL, saddr);
901 if (fn->fn_flags & RTN_RTINFO)
/* Try to take a reference on *prt; if the dst is already being destroyed
 * (dst_hold_safe() fails), substitute the null entry so the caller always
 * gets a held route.  Returns whether the ORIGINAL route was held.
 */
906 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
909 struct rt6_info *rt = *prt;
911 if (dst_hold_safe(&rt->dst))
/* refcount race lost: fall back to the per-net null entry */
914 rt = net->ipv6.ip6_null_entry;
923 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
924 struct fib6_table *table,
925 struct flowi6 *fl6, int flags)
927 struct rt6_info *rt, *rt_cache;
928 struct fib6_node *fn;
931 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
933 rt = rcu_dereference(fn->leaf);
935 rt = net->ipv6.ip6_null_entry;
937 rt = rt6_device_match(net, rt, &fl6->saddr,
938 fl6->flowi6_oif, flags);
939 if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
940 rt = rt6_multipath_select(rt, fl6,
941 fl6->flowi6_oif, flags);
943 if (rt == net->ipv6.ip6_null_entry) {
944 fn = fib6_backtrack(fn, &fl6->saddr);
948 /* Search through exception table */
949 rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
953 if (ip6_hold_safe(net, &rt, true))
954 dst_use_noref(&rt->dst, jiffies);
958 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
964 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
967 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
969 EXPORT_SYMBOL_GPL(ip6_route_lookup);
971 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
972 const struct in6_addr *saddr, int oif, int strict)
974 struct flowi6 fl6 = {
978 struct dst_entry *dst;
979 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
982 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
983 flags |= RT6_LOOKUP_F_HAS_SADDR;
986 dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
988 return (struct rt6_info *) dst;
994 EXPORT_SYMBOL(rt6_lookup);
996 /* ip6_ins_rt is called with FREE table->tb6_lock.
997 * It takes new route entry, the addition fails by any reason the
999 * Caller must hold dst before calling it.
/* Insert @rt into its fib6 table under tb6_lock.  Caller must already
 * hold a dst reference for the tree (see ip6_ins_rt()).  Returns the
 * fib6_add() error code.
 */
1002 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
1003 struct mx6_config *mxc,
1004 struct netlink_ext_ack *extack)
1007 struct fib6_table *table;
1009 table = rt->rt6i_table;
1010 spin_lock_bh(&table->tb6_lock);
1011 err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
1012 spin_unlock_bh(&table->tb6_lock);
/* Public wrapper around __ip6_ins_rt() with default netlink info and no
 * metrics.  (The dst_hold taken for the tree reference is in a line lost
 * to extraction -- see the comment on line 1022.)
 */
1017 int ip6_ins_rt(struct rt6_info *rt)
1019 struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
1020 struct mx6_config mxc = { .mx = NULL, };
1022 /* Hold dst to account for the reference from the fib6 tree */
1024 return __ip6_ins_rt(rt, &info, &mxc, NULL);
1027 /* called with rcu_lock held */
1028 static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
1030 struct net_device *dev = rt->dst.dev;
1032 if (rt->rt6i_flags & RTF_LOCAL) {
1033 /* for copies of local routes, dst->dev needs to be the
1034 * device if it is a master device, the master device if
1035 * device is enslaved, and the loopback as the default
1037 if (netif_is_l3_slave(dev) &&
1038 !rt6_need_strict(&rt->rt6i_dst.addr))
1039 dev = l3mdev_master_dev_rcu(dev);
1040 else if (!netif_is_l3_master(dev))
1041 dev = dev_net(dev)->loopback_dev;
1042 /* last case is netif_is_l3_master(dev) is true in which
1043 * case we want dev returned to be dev
1050 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
1051 const struct in6_addr *daddr,
1052 const struct in6_addr *saddr)
1054 struct net_device *dev;
1055 struct rt6_info *rt;
1061 if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
1062 ort = (struct rt6_info *)ort->dst.from;
1065 dev = ip6_rt_get_dev_rcu(ort);
1066 rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
1071 ip6_rt_copy_init(rt, ort);
1072 rt->rt6i_flags |= RTF_CACHE;
1073 rt->rt6i_metric = 0;
1074 rt->dst.flags |= DST_HOST;
1075 rt->rt6i_dst.addr = *daddr;
1076 rt->rt6i_dst.plen = 128;
1078 if (!rt6_is_gw_or_nonexthop(ort)) {
1079 if (ort->rt6i_dst.plen != 128 &&
1080 ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
1081 rt->rt6i_flags |= RTF_ANYCAST;
1082 #ifdef CONFIG_IPV6_SUBTREES
1083 if (rt->rt6i_src.plen && saddr) {
1084 rt->rt6i_src.addr = *saddr;
1085 rt->rt6i_src.plen = 128;
1093 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
1095 struct net_device *dev;
1096 struct rt6_info *pcpu_rt;
1099 dev = ip6_rt_get_dev_rcu(rt);
1100 pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
1104 ip6_rt_copy_init(pcpu_rt, rt);
1105 pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1106 pcpu_rt->rt6i_flags |= RTF_PCPU;
1110 /* It should be called with rcu_read_lock() acquired */
1111 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1113 struct rt6_info *pcpu_rt, **p;
1115 p = this_cpu_ptr(rt->rt6i_pcpu);
1118 if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
1119 rt6_dst_from_metrics_check(pcpu_rt);
1124 static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
1126 struct rt6_info *pcpu_rt, *prev, **p;
1128 pcpu_rt = ip6_rt_pcpu_alloc(rt);
1130 struct net *net = dev_net(rt->dst.dev);
1132 dst_hold(&net->ipv6.ip6_null_entry->dst);
1133 return net->ipv6.ip6_null_entry;
1136 dst_hold(&pcpu_rt->dst);
1137 p = this_cpu_ptr(rt->rt6i_pcpu);
1138 prev = cmpxchg(p, NULL, pcpu_rt);
1140 /* If someone did it before us, return prev instead */
1141 /* release refcnt taken by ip6_rt_pcpu_alloc() */
1142 dst_release_immediate(&pcpu_rt->dst);
1143 /* release refcnt taken by above dst_hold() */
1144 dst_release_immediate(&pcpu_rt->dst);
1145 dst_hold(&prev->dst);
1149 rt6_dst_from_metrics_check(pcpu_rt);
1153 /* exception hash table implementation
1155 static DEFINE_SPINLOCK(rt6_exception_lock);
1157 /* Remove rt6_ex from hash table and free the memory
1158 * Caller must hold rt6_exception_lock
/* Unhash one exception entry, drop its route reference, free it via RCU,
 * and update the bucket depth / per-net cache statistic.
 * Caller must hold rt6_exception_lock.
 */
1160 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1161 struct rt6_exception *rt6_ex)
1163 struct net *net = dev_net(rt6_ex->rt6i->dst.dev);
1165 if (!bucket || !rt6_ex)
/* detach from the fib node before releasing the route */
1167 rt6_ex->rt6i->rt6i_node = NULL;
1168 hlist_del_rcu(&rt6_ex->hlist);
1169 rt6_release(rt6_ex->rt6i);
/* readers may still traverse under RCU; defer the free */
1170 kfree_rcu(rt6_ex, rcu);
1171 WARN_ON_ONCE(!bucket->depth);
1173 net->ipv6.rt6_stats->fib_rt_cache--;
1176 /* Remove oldest rt6_ex in bucket and free the memory
1177 * Caller must hold rt6_exception_lock
/* Evict the least-recently-stamped exception from @bucket when it grows
 * past FIB6_MAX_DEPTH.  Caller must hold rt6_exception_lock.
 */
1179 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1181 struct rt6_exception *rt6_ex, *oldest = NULL;
1186 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
/* time_before handles jiffies wraparound correctly */
1187 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1190 rt6_remove_exception(bucket, oldest);
/* Hash (dst [, src]) into an exception bucket index.  The source address
 * only participates under CONFIG_IPV6_SUBTREES.  The jhash seed is
 * randomized once at boot to resist hash-collision attacks.
 */
1193 static u32 rt6_exception_hash(const struct in6_addr *dst,
1194 const struct in6_addr *src)
1196 static u32 seed __read_mostly;
1199 net_get_random_once(&seed, sizeof(seed));
1200 val = jhash(dst, sizeof(*dst), seed);
1202 #ifdef CONFIG_IPV6_SUBTREES
/* chain the source address into the hash when subtrees are enabled */
1204 val = jhash(src, sizeof(*src), val);
1206 return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1209 /* Helper function to find the cached rt in the hash table
1210 * and update bucket pointer to point to the bucket for this
1211 * (daddr, saddr) pair
1212 * Caller must hold rt6_exception_lock
1214 static struct rt6_exception *
1215 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1216 const struct in6_addr *daddr,
1217 const struct in6_addr *saddr)
1219 struct rt6_exception *rt6_ex;
1222 if (!(*bucket) || !daddr)
1225 hval = rt6_exception_hash(daddr, saddr);
1228 hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1229 struct rt6_info *rt6 = rt6_ex->rt6i;
1230 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1232 #ifdef CONFIG_IPV6_SUBTREES
1233 if (matched && saddr)
1234 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1242 /* Helper function to find the cached rt in the hash table
1243 * and update bucket pointer to point to the bucket for this
1244 * (daddr, saddr) pair
1245 * Caller must hold rcu_read_lock()
1247 static struct rt6_exception *
1248 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1249 const struct in6_addr *daddr,
1250 const struct in6_addr *saddr)
1252 struct rt6_exception *rt6_ex;
1255 WARN_ON_ONCE(!rcu_read_lock_held());
1257 if (!(*bucket) || !daddr)
1260 hval = rt6_exception_hash(daddr, saddr);
1263 hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1264 struct rt6_info *rt6 = rt6_ex->rt6i;
1265 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1267 #ifdef CONFIG_IPV6_SUBTREES
1268 if (matched && saddr)
1269 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1277 static int rt6_insert_exception(struct rt6_info *nrt,
1278 struct rt6_info *ort)
1280 struct net *net = dev_net(ort->dst.dev);
1281 struct rt6_exception_bucket *bucket;
1282 struct in6_addr *src_key = NULL;
1283 struct rt6_exception *rt6_ex;
1286 /* ort can't be a cache or pcpu route */
1287 if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
1288 ort = (struct rt6_info *)ort->dst.from;
1289 WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));
1291 spin_lock_bh(&rt6_exception_lock);
1293 if (ort->exception_bucket_flushed) {
1298 bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1299 lockdep_is_held(&rt6_exception_lock));
1301 bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1307 rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1310 #ifdef CONFIG_IPV6_SUBTREES
1311 /* rt6i_src.plen != 0 indicates ort is in subtree
1312 * and exception table is indexed by a hash of
1313 * both rt6i_dst and rt6i_src.
1314 * Otherwise, the exception table is indexed by
1315 * a hash of only rt6i_dst.
1317 if (ort->rt6i_src.plen)
1318 src_key = &nrt->rt6i_src.addr;
1321 /* Update rt6i_prefsrc as it could be changed
1322 * in rt6_remove_prefsrc()
1324 nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
1325 /* rt6_mtu_change() might lower mtu on ort.
1326 * Only insert this exception route if its mtu
1327 * is less than ort's mtu value.
1329 if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
1334 rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1337 rt6_remove_exception(bucket, rt6_ex);
1339 rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1345 rt6_ex->stamp = jiffies;
1346 atomic_inc(&nrt->rt6i_ref);
1347 nrt->rt6i_node = ort->rt6i_node;
1348 hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1350 net->ipv6.rt6_stats->fib_rt_cache++;
1352 if (bucket->depth > FIB6_MAX_DEPTH)
1353 rt6_exception_remove_oldest(bucket);
1356 spin_unlock_bh(&rt6_exception_lock);
1358 /* Update fn->fn_sernum to invalidate all cached dst */
1360 fib6_update_sernum(ort);
/* Tear down the entire exception table hanging off @rt: mark it flushed
 * (so rt6_insert_exception() won't recreate buckets concurrently), then
 * remove every entry in every bucket under rt6_exception_lock.
 */
1365 void rt6_flush_exceptions(struct rt6_info *rt)
1367 struct rt6_exception_bucket *bucket;
1368 struct rt6_exception *rt6_ex;
1369 struct hlist_node *tmp;
1372 spin_lock_bh(&rt6_exception_lock);
1373 /* Prevent rt6_insert_exception() to recreate the bucket list */
1374 rt->exception_bucket_flushed = 1;
1376 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1377 lockdep_is_held(&rt6_exception_lock));
1381 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
/* _safe variant: rt6_remove_exception() unhashes the current entry */
1382 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1383 rt6_remove_exception(bucket, rt6_ex);
1384 WARN_ON_ONCE(bucket->depth);
1389 spin_unlock_bh(&rt6_exception_lock);
1392 /* Find cached rt in the hash table inside passed in rt
1393 * Caller has to hold rcu_read_lock()
/* Look up a cached (exception) clone of @rt for (daddr, saddr) in the
 * route's exception hash table.  Expired entries are not returned.
 * Caller has to hold rcu_read_lock().
 */
1395 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
1396 struct in6_addr *daddr,
1397 struct in6_addr *saddr)
1399 struct rt6_exception_bucket *bucket;
1400 struct in6_addr *src_key = NULL;
1401 struct rt6_exception *rt6_ex;
1402 struct rt6_info *res = NULL;
1404 bucket = rcu_dereference(rt->rt6i_exception_bucket);
1406 #ifdef CONFIG_IPV6_SUBTREES
1407 /* rt6i_src.plen != 0 indicates rt is in subtree
1408 * and exception table is indexed by a hash of
1409 * both rt6i_dst and rt6i_src.
1410 * Otherwise, the exception table is indexed by
1411 * a hash of only rt6i_dst.
1413 if (rt->rt6i_src.plen)
1416 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
/* only hand back entries that have not expired */
1418 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1424 /* Remove the passed in cached rt from the hash table that contains it */
/* Remove the passed-in cached route @rt from the exception table of its
 * parent route (rt->dst.from).  Returns -EINVAL when @rt is not a cached
 * clone.  Takes rt6_exception_lock around the hash lookup/removal.
 *
 * FIX(review): the guard used "rt->rt6i_flags | RTF_CACHE", which is a
 * bitwise OR with a nonzero constant and therefore always nonzero -- the
 * negated test could never reject a non-RTF_CACHE route.  Changed to "&"
 * so only genuine cached clones are accepted, matching the identical
 * guard pattern used elsewhere in this file (e.g. line 1287).
 */
1425 int rt6_remove_exception_rt(struct rt6_info *rt)
1427 struct rt6_info *from = (struct rt6_info *)rt->dst.from;
1428 struct rt6_exception_bucket *bucket;
1429 struct in6_addr *src_key = NULL;
1430 struct rt6_exception *rt6_ex;
1434 !(rt->rt6i_flags & RTF_CACHE))
/* parent never had any exceptions: nothing to remove */
1437 if (!rcu_access_pointer(from->rt6i_exception_bucket))
1440 spin_lock_bh(&rt6_exception_lock);
1441 bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1442 lockdep_is_held(&rt6_exception_lock));
1443 #ifdef CONFIG_IPV6_SUBTREES
1444 /* rt6i_src.plen != 0 indicates 'from' is in subtree
1445 * and exception table is indexed by a hash of
1446 * both rt6i_dst and rt6i_src.
1447 * Otherwise, the exception table is indexed by
1448 * a hash of only rt6i_dst.
1450 if (from->rt6i_src.plen)
1451 src_key = &rt->rt6i_src.addr;
1453 rt6_ex = __rt6_find_exception_spinlock(&bucket,
1457 rt6_remove_exception(bucket, rt6_ex);
1463 spin_unlock_bh(&rt6_exception_lock);
1467 /* Find rt6_ex which contains the passed in rt cache and
1470 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1472 struct rt6_info *from = (struct rt6_info *)rt->dst.from;
1473 struct rt6_exception_bucket *bucket;
1474 struct in6_addr *src_key = NULL;
1475 struct rt6_exception *rt6_ex;
1478 !(rt->rt6i_flags | RTF_CACHE))
1482 bucket = rcu_dereference(from->rt6i_exception_bucket);
1484 #ifdef CONFIG_IPV6_SUBTREES
1485 /* rt6i_src.plen != 0 indicates 'from' is in subtree
1486 * and exception table is indexed by a hash of
1487 * both rt6i_dst and rt6i_src.
1488 * Otherwise, the exception table is indexed by
1489 * a hash of only rt6i_dst.
1491 if (from->rt6i_src.plen)
1492 src_key = &rt->rt6i_src.addr;
1494 rt6_ex = __rt6_find_exception_rcu(&bucket,
1498 rt6_ex->stamp = jiffies;
/* Clear the preferred-source address on every cached exception route
 * hanging off @rt.  Caller holds rt6_exception_lock (see the
 * rcu_dereference_protected below).
 */
static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	/* walk every hash bucket; plen = 0 marks prefsrc as unset */
	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
/* Lower the cached PMTU of every exception route under @rt to @mtu.
 * Only entries whose own rt6i_pmtu is set and larger than @mtu are
 * touched.  Caller holds rt6_exception_lock.
 */
static void rt6_exceptions_update_pmtu(struct rt6_info *rt, int mtu)
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;
			/* For RTF_CACHE with rt6i_pmtu == 0
			 * (i.e. a redirected route),
			 * the metrics of its rt->dst.from has already
			 * been updated.
			 */
			if (entry->rt6i_pmtu && entry->rt6i_pmtu > mtu)
				entry->rt6i_pmtu = mtu;
/* Both bits must be set for an entry to count as a cached gateway route */
#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

/* Drop every cached gateway exception under @rt whose gateway equals
 * @gateway (used when a router stops being a valid next hop).
 */
static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
					struct in6_addr *gateway)
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	/* cheap unlocked check before taking the lock */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	/* _safe variant: entries are removed while iterating */
	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp,
					  &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;
			if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
			    RTF_CACHE_GATEWAY &&
			    ipv6_addr_equal(gateway,
					    &entry->rt6i_gateway)) {
				rt6_remove_exception(bucket, rt6_ex);
	spin_unlock_bh(&rt6_exception_lock);
/* GC helper: decide whether one cached exception should be aged out.
 * Removes @rt6_ex when it is unreferenced and idle past the GC timeout,
 * or when it points via a gateway whose neighbour entry lost the
 * NTF_ROUTER flag.  Caller holds rt6_exception_lock.
 */
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
	struct rt6_info *rt = rt6_ex->rt6i;
	/* __refcnt == 1 means only the exception table holds it */
	if (atomic_read(&rt->dst.__refcnt) == 1 &&
	    time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
		RT6_TRACE("aging clone %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
	} else if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;
		neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway);
			neigh_flags = neigh->flags;
			neigh_release(neigh);
		/* gateway is no longer advertising itself as a router */
		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
			rt6_remove_exception(bucket, rt6_ex);
/* Walk all exception buckets of @rt and age out stale entries via
 * rt6_age_examine_exception().  Called from the fib6 GC path.
 */
void rt6_age_exceptions(struct rt6_info *rt,
			struct fib6_gc_args *gc_args,
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	/* nothing to do if no exception table was ever created */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp,
					  &bucket->chain, hlist) {
			rt6_age_examine_exception(bucket, rt6_ex,
	spin_unlock_bh(&rt6_exception_lock);
/* Core policy-routing lookup for one fib6 table.
 *
 * Resolution order after rt6_select()/rt6_multipath_select():
 *   1. a matching RTF_CACHE exception entry, if any;
 *   2. the RTF_CACHE route itself;
 *   3. an uncached clone for FLOWI_FLAG_KNOWN_NH without a gateway;
 *   4. otherwise a per-cpu copy of the route.
 * On ip6_null_entry the lookup backtracks, then retries without
 * RT6_LOOKUP_F_REACHABLE before giving up.
 *
 * NOTE(review): this listing elides many lines (rcu_read_lock/unlock,
 * labels such as redo_rt6_select/uncached_rt_out, several braces and
 * returns) — the apparent unbalanced braces are sampling artifacts.
 */
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6, int flags)
	struct fib6_node *fn, *saved_fn;
	struct rt6_info *rt, *rt_cache;
	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	/* when forwarding is off, prefer (probably) reachable routers */
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
	rt = rt6_select(net, fn, oif, strict);
	if (rt->rt6i_nsiblings)
		rt = rt6_multipath_select(rt, fl6, oif, strict);
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			goto redo_rt6_select;
	/*Search through exception table */
	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
	if (rt == net->ipv6.ip6_null_entry) {
		trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
	} else if (rt->rt6i_flags & RTF_CACHE) {
		/* hit in the exception table: bump usage and return it */
		if (ip6_hold_safe(net, &rt, true)) {
			dst_use_noref(&rt->dst, jiffies);
			rt6_dst_from_metrics_check(rt);
		trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(rt->rt6i_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;
		if (ip6_hold_safe(net, &rt, true)) {
			dst_use_noref(&rt->dst, jiffies);
				goto uncached_rt_out;
		uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
		dst_release(&rt->dst);
		/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
		 * No need for another dst_hold()
		 */
		rt6_uncached_list_add(uncached_rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
			/* allocation failed: fall back to the null entry */
			uncached_rt = net->ipv6.ip6_null_entry;
		dst_hold(&uncached_rt->dst);
		trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
	/* Get a percpu copy */
		struct rt6_info *pcpu_rt;
		dst_use_noref(&rt->dst, jiffies);
		pcpu_rt = rt6_get_pcpu_route(rt);
		/* atomic_inc_not_zero() is needed when using rcu */
		if (atomic_inc_not_zero(&rt->rt6i_ref)) {
			/* We have to do the read_unlock first
			 * because rt6_make_pcpu_route() may trigger
			 * ip6_dst_gc() which will take the write_lock.
			 *
			 * No dst_hold() on rt is needed because grabbing
			 * rt->rt6i_ref makes sure rt can't be released.
			 */
			pcpu_rt = rt6_make_pcpu_route(rt);
			/* rt is already removed from tree */
			pcpu_rt = net->ipv6.ip6_null_entry;
			dst_hold(&pcpu_rt->dst);
		trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
EXPORT_SYMBOL_GPL(ip6_pol_route);
/* Input-path adapter for fib6_rule_lookup(): route on the incoming
 * interface (flowi6_iif).
 */
static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
					    struct flowi6 *fl6, int flags)
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
/* Public input-route lookup: adds strict-interface matching for
 * link-local/multicast destinations (except on PIM register devices),
 * then defers to the policy-rule engine.
 */
struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6, int flags)
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;
	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
/* Fill @keys with the L3 fields used for multipath hashing.
 * For ICMPv6 error messages the keys are taken from the embedded
 * (inner) offending header so that errors hash to the same path as
 * the flow they refer to.
 */
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys)
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	/* common case: not ICMPv6 -> hash on the outer header */
	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
	icmph = icmp6_hdr(skb);
	/* only error types embed the offending packet */
	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
	/* copy out the inner header; may fail on short packets */
	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	key_iph = inner_iph;
	memset(keys, 0, sizeof(*keys));
	keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
	keys->addrs.v6addrs.src = key_iph->saddr;
	keys->addrs.v6addrs.dst = key_iph->daddr;
	keys->tags.flow_label = ip6_flowinfo(key_iph);
	keys->basic.ip_proto = key_iph->nexthdr;
/* if skb is set it will be used and fl6 can be NULL */
/* Compute the multipath hash either from the skb's L3 keys (preferred,
 * handles ICMPv6 errors correctly) or directly from @fl6.
 */
u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
	struct flow_keys hash_keys;
	ip6_multipath_l3_keys(skb, &hash_keys);
	return flow_hash_from_keys(&hash_keys);
	/* no skb: hash the flow descriptor itself */
	return get_hash_from_flowi6(fl6);
/* Attach a route (dst) to an incoming IPv6 skb.  Builds a flowi6 from
 * the packet headers, honours collect-metadata tunnels, precomputes the
 * multipath hash for ICMPv6, and stores the lookup result on the skb.
 */
void ip6_route_input(struct sk_buff *skb)
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	tun_info = skb_tunnel_info(skb);
	/* RX side of a metadata tunnel: key the lookup on the tunnel id */
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
	/* ICMPv6 errors must hash like the flow they refer to */
	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
	skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
/* Output-path adapter for fib6_rule_lookup(): route on the outgoing
 * interface (flowi6_oif).
 */
static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
					     struct flowi6 *fl6, int flags)
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
/* Public output-route lookup.  Link-local destinations are first tried
 * through the L3 master device; otherwise flags are derived from the
 * socket binding and source address before the rule lookup runs.
 */
struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
	if (rt6_need_strict(&fl6->daddr)) {
		struct dst_entry *dst;
		dst = l3mdev_link_scope_lookup(net, fl6);
	/* locally generated traffic appears to arrive on loopback */
	fl6->flowi6_iif = LOOPBACK_IFINDEX;
	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;
		flags |= RT6_LOOKUP_F_HAS_SADDR;
		/* honour the socket's IPV6_ADDR_PREFERENCES */
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
/* Clone @dst_orig into a blackhole dst (input/output discard) bound to
 * the loopback device, copying addresses/metrics from the original.
 * Always releases @dst_orig; returns the new dst or ERR_PTR(-ENOMEM).
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;
	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_NONE, 0);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
		/* a blackhole swallows everything in both directions */
		new->input = dst_discard;
		new->output = dst_discard_out;
		dst_copy_metrics(new, &ort->dst);
		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		/* clear RTF_PCPU: the clone is not a per-cpu route */
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
		rt->rt6i_metric = 0;
		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
 *	Destination cache support functions
 */

/* Re-sync this route's metrics pointer with its parent's (dst.from)
 * if the parent's metrics were replaced since we last looked.
 */
static void rt6_dst_from_metrics_check(struct rt6_info *rt)
	    dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
		dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
/* Validate a fib6-owned dst: the caller's cookie must match the
 * route's current cookie and the route must not have expired.
 * NOTE(review): the listing elides the return statements.
 */
static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
	if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
	if (rt6_check_expired(rt))
/* Validate a clone (pcpu/uncached) dst: it must not itself have
 * expired, and its parent (dst.from) must still pass rt6_check().
 */
static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
	if (!__rt6_check_expired(rt) &&
	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
	    rt6_check((struct rt6_info *)(rt->dst.from), cookie))
/* dst_ops->check() implementation: return the dst if still valid for
 * @cookie, NULL otherwise.  Clones are validated against their parent.
 */
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
	struct rt6_info *rt;
	rt = (struct rt6_info *) dst;
	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */
	rt6_dst_from_metrics_check(rt);
	if (rt->rt6i_flags & RTF_PCPU ||
	    (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from))
		return rt6_dst_from_check(rt, cookie);
	return rt6_check(rt, cookie);
/* dst_ops->negative_advice(): called when an upper layer reports the
 * route is misbehaving.  Expired cache entries are dropped here.
 * NOTE(review): listing elides the body past the expiry check.
 */
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
	struct rt6_info *rt = (struct rt6_info *) dst;
	if (rt->rt6i_flags & RTF_CACHE) {
		if (rt6_check_expired(rt)) {
/* dst_ops->link_failure(): send ICMPv6 address-unreachable back to the
 * sender and invalidate the offending route (cache entries are removed,
 * default routes are marked for re-selection).
 */
static void ip6_link_failure(struct sk_buff *skb)
	struct rt6_info *rt;
	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
	rt = (struct rt6_info *) skb_dst(skb);
	if (rt->rt6i_flags & RTF_CACHE) {
			if (dst_hold_safe(&rt->dst))
			struct fib6_node *fn;
			fn = rcu_dereference(rt->rt6i_node);
			if (fn && (rt->rt6i_flags & RTF_DEFAULT))
/* Record a new PMTU on @rt and (re)arm its expiry using the per-netns
 * ip6_rt_mtu_expires sysctl.
 */
static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
	struct net *net = dev_net(rt->dst.dev);
	rt->rt6i_flags |= RTF_MODIFIED;
	rt->rt6i_pmtu = mtu;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
/* True when @rt is a fib6-owned route (pcpu copy or still in the tree)
 * for which a separate RTF_CACHE clone must carry the PMTU, rather
 * than updating @rt in place.
 */
static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
	return !(rt->rt6i_flags & RTF_CACHE) &&
		(rt->rt6i_flags & RTF_PCPU ||
		 rcu_access_pointer(rt->rt6i_node));
/* Apply a PMTU update to @dst.  Addresses come from @iph when present,
 * otherwise from @sk.  Routes that must not be modified in place get an
 * RTF_CACHE clone inserted into the exception table instead.
 * MTU is clamped to at least IPV6_MIN_MTU; increases are ignored.
 */
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;
	if (rt6->rt6i_flags & RTF_LOCAL)
	/* locked metric means the admin pinned the MTU */
	if (dst_metric_locked(dst, RTAX_MTU))
		daddr = &iph->daddr;
		saddr = &iph->saddr;
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	dst_confirm_neigh(dst, daddr);
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
		struct rt6_info *nrt6;
		nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
			rt6_do_update_pmtu(nrt6, mtu);
			/* insertion failed: drop the orphan clone */
			if (rt6_insert_exception(nrt6, rt6))
				dst_release_immediate(&nrt6->dst);
/* dst_ops->update_pmtu() entry point: forward to the worker, taking the
 * IPv6 header from @skb when one is available.
 */
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu)
	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
/* Update the PMTU for the flow described by @skb's embedded IPv6 header
 * (as found in an ICMPv6 Packet Too Big), doing a fresh route lookup.
 * @mtu is in network byte order.
 */
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
		     int oif, u32 mark, kuid_t uid)
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
	fl6.flowlabel = ip6_flowinfo(iph);
	fl6.flowi6_uid = uid;
	dst = ip6_route_output(net, NULL, &fl6);
		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
EXPORT_SYMBOL_GPL(ip6_update_pmtu);
/* Socket-flavoured PMTU update: apply the new MTU, then refresh the
 * socket's cached dst if it is now invalid (connected UDP/raw case).
 */
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
	struct dst_entry *dst;
	ip6_update_pmtu(skb, sock_net(sk), mtu,
			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
	/* only safe to re-route when we own the socket lock context */
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
/* Handle redirects */
/* flowi6 extended with the redirecting router's address, so the lookup
 * callback can verify the redirect source (cast back via container).
 */
struct ip6rd_flowi {
	struct in6_addr gateway;
/* Lookup callback used when processing an ICMPv6 redirect: find the
 * route whose current next hop is the redirecting router (per RFC 4861
 * the redirect is only valid if it came from that next hop), checking
 * exception-table clones too.  Returns ip6_null_entry when no valid
 * route is found.
 */
static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *rt, *rt_cache;
	struct fib6_node *fn;
	/* Get the "current" route for this destination and
	 * check if the redirect has come from appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	for_each_fib6_node_rt_rcu(fn) {
		if (rt6_check_expired(rt))
		if (!(rt->rt6i_flags & RTF_GATEWAY))
		if (fl6->flowi6_oif != rt->dst.dev->ifindex)
		/* rt_cache's gateway might be different from its 'parent'
		 * in the case of an ip redirect.
		 * So we keep searching in the exception table if the gateway
		 * is different.
		 */
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
			rt_cache = rt6_find_cached_rt(rt,
			    ipv6_addr_equal(&rdfl->gateway,
					    &rt_cache->rt6i_gateway)) {
		rt = net->ipv6.ip6_null_entry;
	else if (rt->dst.error) {
		rt = net->ipv6.ip6_null_entry;
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
	ip6_hold_safe(net, &rt, true);
	trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
/* Wrap @fl6 in an ip6rd_flowi carrying @gateway and run the redirect
 * lookup through the policy-rule engine.
 */
static struct dst_entry *ip6_route_redirect(struct net *net,
					const struct flowi6 *fl6,
					const struct in6_addr *gateway)
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip6rd_flowi rdfl;
	rdfl.gateway = *gateway;
	return fib6_rule_lookup(net, &rdfl.fl6,
				flags, __ip6_route_redirect);
/* Process a redirect for the packet in @skb: build the flow from the
 * embedded IPv6 header, locate the affected route and apply the
 * redirect.  The redirecting router is the outer packet's source.
 */
void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_iif = LOOPBACK_IFINDEX;
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark;
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
	fl6.flowlabel = ip6_flowinfo(iph);
	fl6.flowi6_uid = uid;
	dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
	rt6_do_redirect(dst, NULL, skb);
EXPORT_SYMBOL_GPL(ip6_redirect);
/* Variant of ip6_redirect() for redirects without an embedded header:
 * the destination comes from the rd_msg and the flow source is the
 * outer packet's destination (our own address).
 */
void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
	struct dst_entry *dst;
	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_iif = LOOPBACK_IFINDEX;
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark;
	fl6.daddr = msg->dest;
	fl6.saddr = iph->daddr;
	fl6.flowi6_uid = sock_net_uid(net, NULL);
	dst = ip6_route_redirect(net, &fl6, &iph->saddr);
	rt6_do_redirect(dst, NULL, skb);
/* Socket convenience wrapper: redirect using the socket's bound device,
 * mark and uid.
 */
void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
EXPORT_SYMBOL_GPL(ip6_sk_redirect);
/* dst_ops->default_advmss(): advertised MSS derived from the path MTU
 * minus IPv6+TCP headers, clamped below by the ip6_rt_min_advmss sysctl
 * and above per the comment below.
 */
static unsigned int ip6_default_advmss(const struct dst_entry *dst)
	struct net_device *dev = dst->dev;
	unsigned int mtu = dst_mtu(dst);
	struct net *net = dev_net(dev);
	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
	/*
	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
	 * IPV6_MAXPLEN is also valid and means: "any MSS,
	 * rely only on pmtu discovery"
	 */
	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
/* dst_ops->mtu(): effective MTU for this route.  Preference order:
 * cached PMTU, then the RTAX_MTU metric, then the device's mtu6;
 * the result is capped at IP6_MAX_MTU and reduced by any lwtunnel
 * encapsulation headroom.
 */
static unsigned int ip6_mtu(const struct dst_entry *dst)
	const struct rt6_info *rt = (const struct rt6_info *)dst;
	unsigned int mtu = rt->rt6i_pmtu;
	struct inet6_dev *idev;
	mtu = dst_metric_raw(dst, RTAX_MTU);
	idev = __in6_dev_get(dst->dev);
		mtu = idev->cnf.mtu6;
	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
/* Allocate a standalone host route for sending an ICMPv6 packet.  The
 * dst is not inserted in the fib6 tree; it goes on the uncached list so
 * device teardown can release it, and is passed through xfrm_lookup().
 * Returns the dst or an ERR_PTR on failure.
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);
	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);
	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		dst = ERR_PTR(-ENOMEM);
	rt->dst.flags |= DST_HOST;
	rt->dst.output  = ip6_output;
	rt->rt6i_gateway  = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev     = idev;
	/* hop limit 0 metric: use the device/socket default */
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
	/* Add this dst into uncached_list so that rt6_ifdown() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);
	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
/* dst_ops->gc(): trigger fib6 garbage collection when the dst-entry
 * count exceeds ip6_rt_max_size (rate-limited by ip6_rt_gc_min_interval).
 * The adaptive ip6_rt_gc_expire shortens aggressively while pressure
 * persists and decays by the elasticity factor otherwise.
 * Returns non-zero when the table is still over its limit.
 */
static int ip6_dst_gc(struct dst_ops *ops)
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	entries = dst_entries_get_fast(ops);
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
	return entries > rt_max_size;
/* Convert the RTA_METRICS netlink attributes in @cfg into an mx6_config
 * (metrics array plus validity bitmap).  RTAX_CC_ALGO names are mapped
 * to congestion-control keys; HOPLIMIT and FEATURES values are range
 * checked.  NOTE(review): error-path lines are elided in this listing.
 */
static int ip6_convert_metrics(struct mx6_config *mxc,
			       const struct fib6_config *cfg)
	bool ecn_ca = false;
	mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
	nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
		int type = nla_type(nla);
		if (unlikely(type > RTAX_MAX))
		if (type == RTAX_CC_ALGO) {
			char tmp[TCP_CA_NAME_MAX];
			nla_strlcpy(tmp, nla, sizeof(tmp));
			val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
			if (val == TCP_CA_UNSPEC)
			val = nla_get_u32(nla);
		if (type == RTAX_HOPLIMIT && val > 255)
		if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
		__set_bit(type - 1, mxc->mx_valid);
	/* an ECN-capable CA algo implies the ECN_CA feature bit */
		__set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
		mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
/* Resolve a configured gateway address within the route's own table
 * (used to validate the next hop when adding a route with fc_table set).
 * Falls back to a full lookup when the table yields the null entry.
 */
static struct rt6_info *ip6_nh_lookup_table(struct net *net,
					    struct fib6_config *cfg,
					    const struct in6_addr *gw_addr)
	struct flowi6 fl6 = {
		.flowi6_oif = cfg->fc_ifindex,
		.saddr = cfg->fc_prefsrc,
	struct fib6_table *table;
	struct rt6_info *rt;
	int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE;
	table = fib6_get_table(net, cfg->fc_table);
	if (!ipv6_addr_any(&cfg->fc_prefsrc))
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
	/* if table lookup failed, fall back to full lookup */
	if (rt == net->ipv6.ip6_null_entry) {
/* Build (but do not insert) a struct rt6_info from a netlink/ioctl
 * fib6_config.  Validates prefix lengths, flags, gateway and prefsrc,
 * resolves the egress device, and sets up input/output handlers
 * (including reject routes and lwtunnel encapsulation).
 * Returns the new route or ERR_PTR(err); on error the partially built
 * dst is released.  NOTE(review): this listing elides many lines
 * (gotos, braces, some assignments) — treat structure as indicative.
 */
static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
					      struct netlink_ext_ack *extack)
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct rt6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	struct fib6_table *table;
	/* RTF_PCPU is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_PCPU) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
	if (cfg->fc_dst_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
	if (cfg->fc_src_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid source address length");
#ifndef CONFIG_IPV6_SUBTREES
	if (cfg->fc_src_len) {
		NL_SET_ERR_MSG(extack,
			       "Specifying source address requires IPV6_SUBTREES to be enabled");
	if (cfg->fc_ifindex) {
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		idev = in6_dev_get(dev);
	if (cfg->fc_metric == 0)
		cfg->fc_metric = IP6_RT_PRIO_USER;
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		table = fib6_new_table(net, cfg->fc_table);
	rt = ip6_dst_alloc(net, NULL,
			   (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
	if (cfg->fc_flags & RTF_EXPIRES)
		rt6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
		rt6_clean_expires(rt);
	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->rt6i_protocol = cfg->fc_protocol;
	addr_type = ipv6_addr_type(&cfg->fc_dst);
	/* choose the input handler from the destination class */
	if (addr_type & IPV6_ADDR_MULTICAST)
		rt->dst.input = ip6_mc_input;
	else if (cfg->fc_flags & RTF_LOCAL)
		rt->dst.input = ip6_input;
		rt->dst.input = ip6_forward;
	rt->dst.output = ip6_output;
	if (cfg->fc_encap) {
		struct lwtunnel_state *lwtstate;
		err = lwtunnel_build_state(cfg->fc_encap_type,
					   cfg->fc_encap, AF_INET6, cfg,
		rt->dst.lwtstate = lwtstate_get(lwtstate);
		if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
			rt->dst.lwtstate->orig_output = rt->dst.output;
			rt->dst.output = lwtunnel_output;
		if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
			rt->dst.lwtstate->orig_input = rt->dst.input;
			rt->dst.input = lwtunnel_input;
	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->rt6i_dst.plen = cfg->fc_dst_len;
	if (rt->rt6i_dst.plen == 128)
		rt->dst.flags |= DST_HOST;
#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->rt6i_src.plen = cfg->fc_src_len;
	rt->rt6i_metric = cfg->fc_metric;
	/* We cannot add true routes via loopback here,
	   they would result in kernel looping; promote them to reject routes
	 */
	if ((cfg->fc_flags & RTF_REJECT) ||
	    (dev && (dev->flags & IFF_LOOPBACK) &&
	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
	     !(cfg->fc_flags & RTF_LOCAL))) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			dev = net->loopback_dev;
			idev = in6_dev_get(dev);
		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
		switch (cfg->fc_type) {
			rt->dst.error = -EINVAL;
			rt->dst.output = dst_discard_out;
			rt->dst.input = dst_discard;
			rt->dst.error = -EACCES;
			rt->dst.output = ip6_pkt_prohibit_out;
			rt->dst.input = ip6_pkt_prohibit;
		case RTN_UNREACHABLE:
			rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
					: (cfg->fc_type == RTN_UNREACHABLE)
					? -EHOSTUNREACH : -ENETUNREACH;
			rt->dst.output = ip6_pkt_discard_out;
			rt->dst.input = ip6_pkt_discard;
	if (cfg->fc_flags & RTF_GATEWAY) {
		const struct in6_addr *gw_addr;
		gw_addr = &cfg->fc_gateway;
		gwa_type = ipv6_addr_type(gw_addr);
		/* if gw_addr is local we will fail to detect this in case
		 * address is still TENTATIVE (DAD in progress). rt6_lookup()
		 * will return already-added prefix route via interface that
		 * prefix route was assigned to, which might be non-loopback.
		 */
		if (ipv6_chk_addr_and_flags(net, gw_addr,
					    gwa_type & IPV6_ADDR_LINKLOCAL ?
					    dev : NULL, 0, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid gateway address");
		rt->rt6i_gateway = *gw_addr;
		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
			struct rt6_info *grt = NULL;
			/* IPv6 strictly inhibits using not link-local
			   addresses as nexthop address.
			   Otherwise, router will not able to send redirects.
			   It is very good, but in some (rare!) circumstances
			   (SIT, PtP, NBMA NOARP links) it is handy to allow
			   some exceptions. --ANK
			   We allow IPv4-mapped nexthops to support RFC4798-type
			   addressing
			 */
			if (!(gwa_type & (IPV6_ADDR_UNICAST |
					  IPV6_ADDR_MAPPED))) {
				NL_SET_ERR_MSG(extack,
					       "Invalid gateway address");
			if (cfg->fc_table) {
				grt = ip6_nh_lookup_table(net, cfg, gw_addr);
					if (grt->rt6i_flags & RTF_GATEWAY ||
					    (dev && dev != grt->dst.dev)) {
				grt = rt6_lookup(net, gw_addr, NULL,
						 cfg->fc_ifindex, 1);
				err = -EHOSTUNREACH;
				if (dev != grt->dst.dev) {
				idev = grt->rt6i_idev;
				in6_dev_hold(grt->rt6i_idev);
			if (!(grt->rt6i_flags & RTF_GATEWAY))
			NL_SET_ERR_MSG(extack, "Egress device not specified");
		} else if (dev->flags & IFF_LOOPBACK) {
			NL_SET_ERR_MSG(extack,
				       "Egress device can not be loopback device for this route");
	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid source address");
		rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
		rt->rt6i_prefsrc.plen = 128;
		rt->rt6i_prefsrc.plen = 0;
	rt->rt6i_flags = cfg->fc_flags;
	rt->rt6i_idev = idev;
	rt->rt6i_table = table;
	cfg->fc_nlinfo.nl_net = dev_net(dev);
	dst_release_immediate(&rt->dst);
	return ERR_PTR(err);
/* Create a route from @cfg, convert its metrics, and insert it into the
 * fib6 tree.  On failure the unreferenced route is released immediately.
 */
int ip6_route_add(struct fib6_config *cfg,
		  struct netlink_ext_ack *extack)
	struct mx6_config mxc = { .mx = NULL, };
	struct rt6_info *rt;
	rt = ip6_route_info_create(cfg, extack);
	err = ip6_convert_metrics(&mxc, cfg);
	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
	dst_release_immediate(&rt->dst);
/* Delete one route from its table under tb6_lock.  The null entry can
 * never be deleted.
 */
static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
	struct fib6_table *table;
	struct net *net = dev_net(rt->dst.dev);
	if (rt == net->ipv6.ip6_null_entry) {
	table = rt->rt6i_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_del(rt, info);
	spin_unlock_bh(&table->tb6_lock);
/* Public single-route delete: derives the netns from the route's device
 * and forwards to __ip6_del_rt().
 */
int ip6_del_rt(struct rt6_info *rt)
	struct nl_info info = {
		.nl_net = dev_net(rt->dst.dev),
	return __ip6_del_rt(rt, &info);
/* Delete a multipath route: when fc_delete_all_nh is set, remove @rt
 * together with all of its siblings, emitting one combined RTM_DELROUTE
 * notification (per-route notifications are suppressed via skip_notify).
 */
static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	if (rt == net->ipv6.ip6_null_entry)
	table = rt->rt6i_table;
	spin_lock_bh(&table->tb6_lock);
	if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
		struct rt6_info *sibling, *next_sibling;
		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
			if (rt6_fill_node(net, skb, rt,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				info->skip_notify = 1;
		list_for_each_entry_safe(sibling, next_sibling,
			err = fib6_del(sibling, info);
	err = fib6_del(rt, info);
	spin_unlock_bh(&table->tb6_lock);
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
/* Delete the route(s) matching @cfg.  RTF_CACHE requests target
 * exception-table clones; otherwise the tree route matching device,
 * gateway, metric and protocol filters is removed (one hop only when a
 * gateway was specified, all siblings otherwise).
 */
static int ip6_route_del(struct fib6_config *cfg,
			 struct netlink_ext_ack *extack)
	struct rt6_info *rt, *rt_cache;
	struct fib6_table *table;
	struct fib6_node *fn;
	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
		NL_SET_ERR_MSG(extack, "FIB table does not exist");
	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len,
			 !(cfg->fc_flags & RTF_CACHE));
		for_each_fib6_node_rt_rcu(fn) {
			if (cfg->fc_flags & RTF_CACHE) {
				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
			if (cfg->fc_ifindex &&
			     rt->dst.dev->ifindex != cfg->fc_ifindex))
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
			if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
			if (!dst_hold_safe(&rt->dst))
			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				return __ip6_del_rt(rt, &cfg->fc_nlinfo);
			return __ip6_del_rt_siblings(rt, cfg);
/* rt6_do_redirect(): process an inbound ICMPv6 Redirect (RFC 4861
 * sec 8): validate the message, update the neighbour cache for the
 * redirect target, clone the route as a cached RTF_DYNAMIC exception
 * pointing at the new gateway, and fire a netevent.
 * NOTE(review): line-sampled fragment; several validation branches
 * (e.g. on_link assignment, lladdr init) are not visible.
 */
2932 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
2934 struct netevent_redirect netevent;
2935 struct rt6_info *rt, *nrt = NULL;
2936 struct ndisc_options ndopts;
2937 struct inet6_dev *in6_dev;
2938 struct neighbour *neigh;
2940 int optlen, on_link;
/* optlen = ND options after the fixed rd_msg header. */
2943 optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
2944 optlen -= sizeof(*msg);
2947 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
2951 msg = (struct rd_msg *)icmp6_hdr(skb);
2953 if (ipv6_addr_is_multicast(&msg->dest)) {
2954 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
/* dest == target means the destination is on-link; otherwise the
 * target must be a link-local unicast router address. */
2959 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
2961 } else if (ipv6_addr_type(&msg->target) !=
2962 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
2963 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n")\
;
2967 in6_dev = __in6_dev_get(skb->dev);
/* Forwarding nodes and interfaces with accept_redirects off ignore
 * redirects. */
2970 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
2974 * The IP source address of the Redirect MUST be the same as the current
2975 * first-hop router for the specified ICMP Destination Address.
2978 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
2979 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
2984 if (ndopts.nd_opts_tgt_lladdr) {
2985 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
2988 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
2993 rt = (struct rt6_info *) dst;
2994 if (rt->rt6i_flags & RTF_REJECT) {
2995 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
2999 /* Redirect received -> path was valid.
3000 * Look, redirects are sent only in response to data packets,
3001 * so that this nexthop apparently is reachable. --ANK
3003 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3005 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3010 * We have finally decided to accept it.
/* Mark the target as a router unless the redirect is on-link. */
3013 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3014 NEIGH_UPDATE_F_WEAK_OVERRIDE|
3015 NEIGH_UPDATE_F_OVERRIDE|
3016 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3017 NEIGH_UPDATE_F_ISROUTER)),
3018 NDISC_REDIRECT, &ndopts);
3020 nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
3024 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3026 nrt->rt6i_flags &= ~RTF_GATEWAY;
3028 nrt->rt6i_protocol = RTPROT_REDIRECT;
3029 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3031 /* No need to remove rt from the exception table if rt is
3032 * a cached route because rt6_insert_exception() will
/* Insertion failure -> drop the freshly-allocated clone. */
3035 if (rt6_insert_exception(nrt, rt)) {
3036 dst_release_immediate(&nrt->dst);
3040 netevent.old = &rt->dst;
3041 netevent.new = &nrt->dst;
3042 netevent.daddr = &msg->dest;
3043 netevent.neigh = neigh;
3044 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3047 neigh_release(neigh);
3051 * Misc support functions
/* rt6_set_from(): link @rt to its parent @from (hold a ref on the
 * parent dst and share its metrics). BUG_ON guards against chaining
 * through an already-derived route.
 */
3054 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
3056 BUG_ON(from->dst.from);
3058 rt->rt6i_flags &= ~RTF_EXPIRES;
3059 dst_hold(&from->dst);
3060 rt->dst.from = &from->dst;
3061 dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
/* ip6_rt_copy_init(): initialise @rt as a copy of @ort -- handlers,
 * addresses, flags, idev (with an extra hold), table and lwtstate --
 * and attach it to @ort via rt6_set_from().
 */
3064 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
3066 rt->dst.input = ort->dst.input;
3067 rt->dst.output = ort->dst.output;
3068 rt->rt6i_dst = ort->rt6i_dst;
3069 rt->dst.error = ort->dst.error;
3070 rt->rt6i_idev = ort->rt6i_idev;
3072 in6_dev_hold(rt->rt6i_idev);
3073 rt->dst.lastuse = jiffies;
3074 rt->rt6i_gateway = ort->rt6i_gateway;
3075 rt->rt6i_flags = ort->rt6i_flags;
3076 rt6_set_from(rt, ort);
3077 rt->rt6i_metric = ort->rt6i_metric;
3078 #ifdef CONFIG_IPV6_SUBTREES
3079 rt->rt6i_src = ort->rt6i_src;
3081 rt->rt6i_prefsrc = ort->rt6i_prefsrc;
3082 rt->rt6i_table = ort->rt6i_table;
/* lwtstate is refcounted; take our own reference. */
3083 rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
3086 #ifdef CONFIG_IPV6_ROUTE_INFO
/* rt6_get_route_info(): look up an RA-learned (RTF_ROUTEINFO) route
 * for @prefix/@prefixlen via @gwaddr on @dev in the l3mdev-selected
 * info table. Returns the held route or NULL.
 */
3087 static struct rt6_info *rt6_get_route_info(struct net *net,
3088 const struct in6_addr *prefix, int prefixlen,
3089 const struct in6_addr *gwaddr,
3090 struct net_device *dev)
3092 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3093 int ifindex = dev->ifindex;
3094 struct fib6_node *fn;
3095 struct rt6_info *rt = NULL;
3096 struct fib6_table *table;
3098 table = fib6_get_table(net, tb_id);
3103 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3107 for_each_fib6_node_rt_rcu(fn) {
/* Match device, both RTF_ROUTEINFO|RTF_GATEWAY flags, and gateway. */
3108 if (rt->dst.dev->ifindex != ifindex)
3110 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3112 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
3114 ip6_hold_safe(NULL, &rt, false);
/* rt6_add_route_info(): install an RA route-information option as an
 * RTPROT_RA route (prefix length 0 => default route), then return the
 * freshly-looked-up entry via rt6_get_route_info().
 */
3122 static struct rt6_info *rt6_add_route_info(struct net *net,
3123 const struct in6_addr *prefix, int prefixlen,
3124 const struct in6_addr *gwaddr,
3125 struct net_device *dev,
3128 struct fib6_config cfg = {
3129 .fc_metric = IP6_RT_PRIO_USER,
3130 .fc_ifindex = dev->ifindex,
3131 .fc_dst_len = prefixlen,
3132 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3133 RTF_UP | RTF_PREF(pref),
3134 .fc_protocol = RTPROT_RA,
3135 .fc_nlinfo.portid = 0,
3136 .fc_nlinfo.nlh = NULL,
3137 .fc_nlinfo.nl_net = net,
3140 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3141 cfg.fc_dst = *prefix;
3142 cfg.fc_gateway = *gwaddr;
3144 /* We should treat it as a default route if prefix length is 0. */
3146 cfg.fc_flags |= RTF_DEFAULT;
/* Add may fail (e.g. duplicate); the lookup below reports reality. */
3148 ip6_route_add(&cfg, NULL);
3150 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
/* rt6_get_dflt_router(): find the RA-learned default route through
 * gateway @addr on @dev in the l3mdev-selected default table.
 * Returns the held route or NULL.
 */
3154 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
3156 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3157 struct rt6_info *rt;
3158 struct fib6_table *table;
3160 table = fib6_get_table(dev_net(dev), tb_id);
3165 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3166 if (dev == rt->dst.dev &&
3167 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3168 ipv6_addr_equal(&rt->rt6i_gateway, addr))
3172 ip6_hold_safe(NULL, &rt, false);
/* rt6_add_dflt_router(): install a default route learned from an RA
 * (RTPROT_RA, RTF_EXPIRES) and flag the table as holding a default
 * router, then return the inserted entry via rt6_get_dflt_router().
 */
3177 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
3178 struct net_device *dev,
3181 struct fib6_config cfg = {
3182 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3183 .fc_metric = IP6_RT_PRIO_USER,
3184 .fc_ifindex = dev->ifindex,
3185 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3186 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3187 .fc_protocol = RTPROT_RA,
3188 .fc_nlinfo.portid = 0,
3189 .fc_nlinfo.nlh = NULL,
3190 .fc_nlinfo.nl_net = dev_net(dev),
3193 cfg.fc_gateway = *gwaddr;
3195 if (!ip6_route_add(&cfg, NULL)) {
3196 struct fib6_table *table;
3198 table = fib6_get_table(dev_net(dev), cfg.fc_table);
/* Lets rt6_purge_dflt_routers() skip tables with no defaults. */
3200 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3203 return rt6_get_dflt_router(gwaddr, dev);
/* __rt6_purge_dflt_routers(): drop every RA-learned default/addrconf
 * route in @table unless its interface has accept_ra == 2 (i.e. RA
 * acceptance forced even when forwarding); clear the table flag.
 */
3206 static void __rt6_purge_dflt_routers(struct fib6_table *table)
3208 struct rt6_info *rt;
3212 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3213 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3214 (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
3215 if (dst_hold_safe(&rt->dst)) {
3226 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
/* rt6_purge_dflt_routers(): walk the per-netns FIB table hash and
 * purge defaults from each table flagged as holding them. */
3229 void rt6_purge_dflt_routers(struct net *net)
3231 struct fib6_table *table;
3232 struct hlist_head *head;
3237 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3238 head = &net->ipv6.fib_table_hash[h];
3239 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3240 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3241 __rt6_purge_dflt_routers(table);
/* rtmsg_to_fib6_config(): translate a legacy in6_rtmsg (SIOCADDRT/
 * SIOCDELRT ioctl payload) into a fib6_config, picking the table from
 * the l3mdev of the given ifindex when one exists.
 */
3248 static void rtmsg_to_fib6_config(struct net *net,
3249 struct in6_rtmsg *rtmsg,
3250 struct fib6_config *cfg)
3252 memset(cfg, 0, sizeof(*cfg));
3254 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3256 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3257 cfg->fc_metric = rtmsg->rtmsg_metric;
3258 cfg->fc_expires = rtmsg->rtmsg_info;
3259 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3260 cfg->fc_src_len = rtmsg->rtmsg_src_len;
3261 cfg->fc_flags = rtmsg->rtmsg_flags;
3263 cfg->fc_nlinfo.nl_net = net;
3265 cfg->fc_dst = rtmsg->rtmsg_dst;
3266 cfg->fc_src = rtmsg->rtmsg_src;
3267 cfg->fc_gateway = rtmsg->rtmsg_gateway;
/* ipv6_route_ioctl(): legacy SIOCADDRT/SIOCDELRT entry point.
 * Requires CAP_NET_ADMIN in the netns user namespace, copies the
 * in6_rtmsg from userspace, converts it, and adds/deletes the route.
 * NOTE(review): sampled fragment -- rtnl locking and the returns for
 * the permission/copy failures are among the missing lines.
 */
3270 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3272 struct fib6_config cfg;
3273 struct in6_rtmsg rtmsg;
3277 case SIOCADDRT: /* Add a route */
3278 case SIOCDELRT: /* Delete a route */
3279 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3281 err = copy_from_user(&rtmsg, arg,
3282 sizeof(struct in6_rtmsg));
3286 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3291 err = ip6_route_add(&cfg, NULL);
3294 err = ip6_route_del(&cfg, NULL);
3308 * Drop the packet on the floor
/* ip6_pkt_drop(): common drop path for reject-type routes -- bump
 * the relevant MIB counter (InAddrErrors for unspecified input
 * destinations, otherwise the supplied no-route counter) and send an
 * ICMPv6 Destination Unreachable with @code.
 */
3311 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3314 struct dst_entry *dst = skb_dst(skb);
3315 switch (ipstats_mib_noroutes) {
3316 case IPSTATS_MIB_INNOROUTES:
3317 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3318 if (type == IPV6_ADDR_ANY) {
3319 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3320 IPSTATS_MIB_INADDRERRORS);
3324 case IPSTATS_MIB_OUTNOROUTES:
3325 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3326 ipstats_mib_noroutes);
3329 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
/* dst.input handler for blackhole-style routes (input side). */
3334 static int ip6_pkt_discard(struct sk_buff *skb)
3336 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
/* dst.output handler for the same; redirect skb->dev to the dst dev
 * before dropping. */
3339 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3341 skb->dev = skb_dst(skb)->dev;
3342 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
/* Input/output handlers for administratively prohibited routes. */
3345 static int ip6_pkt_prohibit(struct sk_buff *skb)
3347 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3350 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3352 skb->dev = skb_dst(skb)->dev;
3353 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3357 * Allocate a dst for local (unicast / anycast) address.
/* addrconf_dst_alloc(): allocate a /128 host route for a local
 * unicast or anycast address on @idev, flagged RTF_LOCAL or
 * RTF_ANYCAST respectively, bound to the l3mdev-selected local table.
 * Returns the route or ERR_PTR(-ENOMEM).
 */
3360 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
3361 const struct in6_addr *addr,
3365 struct net *net = dev_net(idev->dev);
3366 struct net_device *dev = idev->dev;
3367 struct rt6_info *rt;
3369 rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
3371 return ERR_PTR(-ENOMEM);
3375 rt->dst.flags |= DST_HOST;
3376 rt->dst.input = ip6_input;
3377 rt->dst.output = ip6_output;
3378 rt->rt6i_idev = idev;
3380 rt->rt6i_protocol = RTPROT_KERNEL;
3381 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
3383 rt->rt6i_flags |= RTF_ANYCAST;
3385 rt->rt6i_flags |= RTF_LOCAL;
/* Local route: gateway == destination == the address itself. */
3387 rt->rt6i_gateway = *addr;
3388 rt->rt6i_dst.addr = *addr;
3389 rt->rt6i_dst.plen = 128;
3390 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3391 rt->rt6i_table = fib6_get_table(net, tb_id);
3396 /* remove deleted ip from prefsrc entries */
/* Argument bundle for the fib6_clean_all() prefsrc walk below. */
3397 struct arg_dev_net_ip {
3398 struct net_device *dev;
3400 struct in6_addr *addr;
/* fib6_remove_prefsrc(): per-route callback -- if the route's
 * preferred source matches the deleted address (and the device
 * matches or none was given), zero the prefsrc and scrub the cached
 * exception routes under rt6_exception_lock.
 */
3403 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
3405 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3406 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3407 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3409 if (((void *)rt->dst.dev == dev || !dev) &&
3410 rt != net->ipv6.ip6_null_entry &&
3411 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
3412 spin_lock_bh(&rt6_exception_lock);
3413 /* remove prefsrc entry */
3414 rt->rt6i_prefsrc.plen = 0;
3415 /* need to update cache as well */
3416 rt6_exceptions_remove_prefsrc(rt);
3417 spin_unlock_bh(&rt6_exception_lock);
/* rt6_remove_prefsrc(): entry point -- clean every FIB entry that
 * still references the address being removed from @ifp's device. */
3422 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3424 struct net *net = dev_net(ifp->idev->dev);
3425 struct arg_dev_net_ip adni = {
3426 .dev = ifp->idev->dev,
3430 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3433 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3435 /* Remove routers and update dst entries when gateway turn into host. */
/* fib6_clean_tohost(): per-route callback that removes RA router
 * routes (RTF_ADDRCONF|RTF_DEFAULT|RTF_GATEWAY all set) whose
 * gateway matches the address that just became a local host address,
 * and scrubs matching cached exception routes.
 */
3436 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
3438 struct in6_addr *gateway = (struct in6_addr *)arg;
3440 if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3441 ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
3445 /* Further clean up cached routes in exception table.
3446 * This is needed because cached route may have a different
3447 * gateway than its 'parent' in the case of an ip redirect.
3449 rt6_exceptions_clean_tohost(rt, gateway);
/* rt6_clean_tohost(): run the callback over every FIB table. */
3454 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3456 fib6_clean_all(net, fib6_clean_tohost, gateway);
/* Argument bundle for the fib6_clean_all() ifdown walk below. */
3459 struct arg_dev_net {
3460 struct net_device *dev;
3464 /* called with write lock held for table with rt */
/* fib6_ifdown(): per-route callback selecting routes to delete when
 * @dev goes down (or all devices when dev == NULL). Multipath
 * siblings survive unless the device is unregistering or
 * ignore_routes_with_linkdown is off.
 */
3465 static int fib6_ifdown(struct rt6_info *rt, void *arg)
3467 const struct arg_dev_net *adn = arg;
3468 const struct net_device *dev = adn->dev;
3470 if ((rt->dst.dev == dev || !dev) &&
3471 rt != adn->net->ipv6.ip6_null_entry &&
3472 (rt->rt6i_nsiblings == 0 ||
3473 (dev && netdev_unregistering(dev)) ||
3474 !rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
/* rt6_ifdown(): purge FIB routes for @dev, then flush the uncached
 * (dst-only) routes that still reference it. */
3480 void rt6_ifdown(struct net *net, struct net_device *dev)
3482 struct arg_dev_net adn = {
3487 fib6_clean_all(net, fib6_ifdown, &adn);
3489 rt6_uncached_list_flush_dev(net, dev);
/* Argument bundle for the MTU-change FIB walk below. */
3492 struct rt6_mtu_change_arg {
3493 struct net_device *dev;
/* rt6_mtu_change_route(): per-route callback that propagates a
 * device MTU change into route PMTU metrics (and cached exception
 * routes), unless RTAX_MTU is administratively locked.
 */
3497 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
3499 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
3500 struct inet6_dev *idev;
3502 /* In IPv6 pmtu discovery is not optional,
3503 so that RTAX_MTU lock cannot disable it.
3504 We still use this lock to block changes
3505 caused by addrconf/ndisc.
3508 idev = __in6_dev_get(arg->dev);
3512 /* For administrative MTU increase, there is no way to discover
3513 IPv6 PMTU increase, so PMTU increase should be updated here.
3514 Since RFC 1981 doesn't include administrative MTU increase
3515 update PMTU increase is a MUST. (i.e. jumbo frame)
3518 If new MTU is less than route PMTU, this new MTU will be the
3519 lowest MTU in the path, update the route PMTU to reflect PMTU
3520 decreases; if new MTU is greater than route PMTU, and the
3521 old MTU is the lowest MTU in the path, update the route PMTU
3522 to reflect the increase. In this case if the other nodes' MTU
3523 also have the lowest MTU, TOO BIG MESSAGE will be lead to
3526 if (rt->dst.dev == arg->dev &&
3527 dst_metric_raw(&rt->dst, RTAX_MTU) &&
3528 !dst_metric_locked(&rt->dst, RTAX_MTU)) {
3529 spin_lock_bh(&rt6_exception_lock);
3530 if (dst_mtu(&rt->dst) >= arg->mtu ||
3531 (dst_mtu(&rt->dst) < arg->mtu &&
3532 dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
3533 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
3535 rt6_exceptions_update_pmtu(rt, arg->mtu);
3536 spin_unlock_bh(&rt6_exception_lock);
/* rt6_mtu_change(): entry point -- walk all FIB entries in @dev's
 * netns and apply the new MTU. */
3541 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
3543 struct rt6_mtu_change_arg arg = {
3548 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
/* Netlink attribute validation policy for IPv6 RTM_* route messages:
 * declares expected type/length per RTA_* attribute for nlmsg_parse().
 */
3551 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
3552 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
3553 [RTA_OIF] = { .type = NLA_U32 },
3554 [RTA_IIF] = { .type = NLA_U32 },
3555 [RTA_PRIORITY] = { .type = NLA_U32 },
3556 [RTA_METRICS] = { .type = NLA_NESTED },
3557 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
3558 [RTA_PREF] = { .type = NLA_U8 },
3559 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
3560 [RTA_ENCAP] = { .type = NLA_NESTED },
3561 [RTA_EXPIRES] = { .type = NLA_U32 },
3562 [RTA_UID] = { .type = NLA_U32 },
3563 [RTA_MARK] = { .type = NLA_U32 },
/* rtm_to_fib6_config(): parse an RTM_NEWROUTE/RTM_DELROUTE netlink
 * message (header + attributes) into a fib6_config. Validates
 * attributes against rtm_ipv6_policy and the lwtunnel encap types.
 * NOTE(review): sampled fragment -- some error-return and goto lines
 * are missing between the visible statements.
 */
3566 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
3567 struct fib6_config *cfg,
3568 struct netlink_ext_ack *extack)
3571 struct nlattr *tb[RTA_MAX+1];
3575 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
3581 rtm = nlmsg_data(nlh);
3582 memset(cfg, 0, sizeof(*cfg));
3584 cfg->fc_table = rtm->rtm_table;
3585 cfg->fc_dst_len = rtm->rtm_dst_len;
3586 cfg->fc_src_len = rtm->rtm_src_len;
3587 cfg->fc_flags = RTF_UP;
3588 cfg->fc_protocol = rtm->rtm_protocol;
3589 cfg->fc_type = rtm->rtm_type;
/* Reject-style route types all map onto RTF_REJECT; fc_type keeps
 * the distinction for dst.error selection later. */
3591 if (rtm->rtm_type == RTN_UNREACHABLE ||
3592 rtm->rtm_type == RTN_BLACKHOLE ||
3593 rtm->rtm_type == RTN_PROHIBIT ||
3594 rtm->rtm_type == RTN_THROW)
3595 cfg->fc_flags |= RTF_REJECT;
3597 if (rtm->rtm_type == RTN_LOCAL)
3598 cfg->fc_flags |= RTF_LOCAL;
3600 if (rtm->rtm_flags & RTM_F_CLONED)
3601 cfg->fc_flags |= RTF_CACHE;
3603 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
3604 cfg->fc_nlinfo.nlh = nlh;
3605 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
3607 if (tb[RTA_GATEWAY]) {
3608 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
3609 cfg->fc_flags |= RTF_GATEWAY;
/* Copy only the prefix-length-worth of address bytes, after checking
 * the attribute actually carries that many. */
3613 int plen = (rtm->rtm_dst_len + 7) >> 3;
3615 if (nla_len(tb[RTA_DST]) < plen)
3618 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
3622 int plen = (rtm->rtm_src_len + 7) >> 3;
3624 if (nla_len(tb[RTA_SRC]) < plen)
3627 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
3630 if (tb[RTA_PREFSRC])
3631 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
3634 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
3636 if (tb[RTA_PRIORITY])
3637 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
3639 if (tb[RTA_METRICS]) {
3640 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
3641 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
3645 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
3647 if (tb[RTA_MULTIPATH]) {
3648 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
3649 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
3651 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
3652 cfg->fc_mp_len, extack);
/* Unknown RTA_PREF values degrade to medium preference (RFC 4191). */
3658 pref = nla_get_u8(tb[RTA_PREF]);
3659 if (pref != ICMPV6_ROUTER_PREF_LOW &&
3660 pref != ICMPV6_ROUTER_PREF_HIGH)
3661 pref = ICMPV6_ROUTER_PREF_MEDIUM;
3662 cfg->fc_flags |= RTF_PREF(pref);
3666 cfg->fc_encap = tb[RTA_ENCAP];
3668 if (tb[RTA_ENCAP_TYPE]) {
3669 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
3671 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
3676 if (tb[RTA_EXPIRES]) {
3677 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
3679 if (addrconf_finite_timeout(timeout)) {
3680 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
3681 cfg->fc_flags |= RTF_EXPIRES;
/* Per-nexthop bookkeeping node for multipath add: the built route,
 * its parsed config, converted metrics, and list linkage. */
3691 struct rt6_info *rt6_info;
3692 struct fib6_config r_cfg;
3693 struct mx6_config mxc;
3694 struct list_head next;
/* ip6_print_replace_route_err(): warn once per nexthop when a
 * multipath replace fails partway, since the table may now be
 * inconsistent with what userspace requested. */
3697 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
3701 list_for_each_entry(nh, rt6_nh_list, next) {
3702 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
3703 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
3704 nh->r_cfg.fc_ifindex);
/* ip6_route_info_append(): add @rt to the pending nexthop list,
 * rejecting duplicates (rt6_duplicate_nexthop) and converting the
 * netlink metrics into mxc for insertion later. */
3708 static int ip6_route_info_append(struct list_head *rt6_nh_list,
3709 struct rt6_info *rt, struct fib6_config *r_cfg)
3714 list_for_each_entry(nh, rt6_nh_list, next) {
3715 /* check if rt6_info already exists */
3716 if (rt6_duplicate_nexthop(nh->rt6_info, rt))
3720 nh = kzalloc(sizeof(*nh), GFP_KERNEL);
3724 err = ip6_convert_metrics(&nh->mxc, r_cfg);
3729 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
3730 list_add_tail(&nh->next, rt6_nh_list);
/* ip6_route_mpath_notify(): send the single RTM_NEWROUTE notification
 * for a multipath add/append/replace. For APPEND, rewind to the first
 * sibling of the last inserted route so the dump starts at hop one.
 */
3735 static void ip6_route_mpath_notify(struct rt6_info *rt,
3736 struct rt6_info *rt_last,
3737 struct nl_info *info,
3740 /* if this is an APPEND route, then rt points to the first route
3741 * inserted and rt_last points to last route inserted. Userspace
3742 * wants a consistent dump of the route which starts at the first
3743 * nexthop. Since sibling routes are always added at the end of
3744 * the list, find the first sibling of the last route appended
3746 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
3747 rt = list_first_entry(&rt_last->rt6i_siblings,
3753 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
/* ip6_route_multipath_add(): install an RTA_MULTIPATH route. Phase 1
 * parses each rtnexthop into an rt6_info and queues it; phase 2
 * inserts them one by one with per-route notifications suppressed,
 * emitting a single combined notification at the end. On mid-insert
 * failure, already-added hops are notified then deleted (rollback).
 * NOTE(review): sampled fragment -- error labels and some cleanup
 * lines are not visible.
 */
3756 static int ip6_route_multipath_add(struct fib6_config *cfg,
3757 struct netlink_ext_ack *extack)
3759 struct rt6_info *rt_notif = NULL, *rt_last = NULL;
3760 struct nl_info *info = &cfg->fc_nlinfo;
3761 struct fib6_config r_cfg;
3762 struct rtnexthop *rtnh;
3763 struct rt6_info *rt;
3764 struct rt6_nh *err_nh;
3765 struct rt6_nh *nh, *nh_safe;
3771 int replace = (cfg->fc_nlinfo.nlh &&
3772 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
3773 LIST_HEAD(rt6_nh_list);
3775 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
3776 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
3777 nlflags |= NLM_F_APPEND;
3779 remaining = cfg->fc_mp_len;
3780 rtnh = (struct rtnexthop *)cfg->fc_mp;
3782 /* Parse a Multipath Entry and build a list (rt6_nh_list) of
3783 * rt6_info structs per nexthop
3785 while (rtnh_ok(rtnh, remaining)) {
/* Per-hop config starts as a copy of the base route config and is
 * overridden by the hop's own ifindex/gateway/encap attributes. */
3786 memcpy(&r_cfg, cfg, sizeof(*cfg));
3787 if (rtnh->rtnh_ifindex)
3788 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3790 attrlen = rtnh_attrlen(rtnh);
3792 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3794 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3796 r_cfg.fc_gateway = nla_get_in6_addr(nla);
3797 r_cfg.fc_flags |= RTF_GATEWAY;
3799 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
3800 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
3802 r_cfg.fc_encap_type = nla_get_u16(nla);
3805 rt = ip6_route_info_create(&r_cfg, extack);
3812 err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
3814 dst_release_immediate(&rt->dst);
3818 rtnh = rtnh_next(rtnh, &remaining);
3821 /* for add and replace send one notification with all nexthops.
3822 * Skip the notification in fib6_add_rt2node and send one with
3823 * the full route when done
3825 info->skip_notify = 1;
3828 list_for_each_entry(nh, &rt6_nh_list, next) {
3829 rt_last = nh->rt6_info;
3830 err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
3831 /* save reference to first route for notification */
3832 if (!rt_notif && !err)
3833 rt_notif = nh->rt6_info;
3835 /* nh->rt6_info is used or freed at this point, reset to NULL*/
3836 nh->rt6_info = NULL;
3839 ip6_print_replace_route_err(&rt6_nh_list);
3844 /* Because each route is added like a single route we remove
3845 * these flags after the first nexthop: if there is a collision,
3846 * we have already failed to add the first nexthop:
3847 * fib6_add_rt2node() has rejected it; when replacing, old
3848 * nexthops have been replaced by first new, the rest should
3851 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
3856 /* success ... tell user about new route */
3857 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3861 /* send notification for routes that were added so that
3862 * the delete notifications sent by ip6_route_del are
3866 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3868 /* Delete routes that were already added */
3869 list_for_each_entry(nh, &rt6_nh_list, next) {
3872 ip6_route_del(&nh->r_cfg, extack);
/* Final cleanup: drop any routes never handed to the FIB and free
 * each list node. */
3876 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
3878 dst_release_immediate(&nh->rt6_info->dst);
3880 list_del(&nh->next);
/* ip6_route_multipath_del(): delete each nexthop of an RTA_MULTIPATH
 * request individually via ip6_route_del(), remembering the last
 * error but continuing through the remaining hops.
 */
3887 static int ip6_route_multipath_del(struct fib6_config *cfg,
3888 struct netlink_ext_ack *extack)
3890 struct fib6_config r_cfg;
3891 struct rtnexthop *rtnh;
3894 int err = 1, last_err = 0;
3896 remaining = cfg->fc_mp_len;
3897 rtnh = (struct rtnexthop *)cfg->fc_mp;
3899 /* Parse a Multipath Entry */
3900 while (rtnh_ok(rtnh, remaining)) {
3901 memcpy(&r_cfg, cfg, sizeof(*cfg));
3902 if (rtnh->rtnh_ifindex)
3903 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3905 attrlen = rtnh_attrlen(rtnh);
3907 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3909 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3911 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
3912 r_cfg.fc_flags |= RTF_GATEWAY;
3915 err = ip6_route_del(&r_cfg, extack);
3919 rtnh = rtnh_next(rtnh, &remaining);
/* inet6_rtm_delroute(): RTM_DELROUTE handler -- parse the message,
 * then dispatch to the multipath or single-route delete path. The
 * single path deletes all nexthops (fc_delete_all_nh). */
3925 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3926 struct netlink_ext_ack *extack)
3928 struct fib6_config cfg;
3931 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3936 return ip6_route_multipath_del(&cfg, extack);
3938 cfg.fc_delete_all_nh = 1;
3939 return ip6_route_del(&cfg, extack);
/* inet6_rtm_newroute(): RTM_NEWROUTE handler -- same dispatch for
 * route addition. */
3943 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3944 struct netlink_ext_ack *extack)
3946 struct fib6_config cfg;
3949 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3954 return ip6_route_multipath_add(&cfg, extack);
3956 return ip6_route_add(&cfg, extack);
/* rt6_nlmsg_size(): worst-case skb size for an RTM_* message about
 * @rt, including one RTA_MULTIPATH rtnexthop per sibling. Used to
 * size notification skbs before rt6_fill_node().
 */
3959 static size_t rt6_nlmsg_size(struct rt6_info *rt)
3961 int nexthop_len = 0;
3963 if (rt->rt6i_nsiblings) {
3964 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
3965 + NLA_ALIGN(sizeof(struct rtnexthop))
3966 + nla_total_size(16) /* RTA_GATEWAY */
3967 + lwtunnel_get_encap_size(rt->dst.lwtstate);
3969 nexthop_len *= rt->rt6i_nsiblings;
3972 return NLMSG_ALIGN(sizeof(struct rtmsg))
3973 + nla_total_size(16) /* RTA_SRC */
3974 + nla_total_size(16) /* RTA_DST */
3975 + nla_total_size(16) /* RTA_GATEWAY */
3976 + nla_total_size(16) /* RTA_PREFSRC */
3977 + nla_total_size(4) /* RTA_TABLE */
3978 + nla_total_size(4) /* RTA_IIF */
3979 + nla_total_size(4) /* RTA_OIF */
3980 + nla_total_size(4) /* RTA_PRIORITY */
3981 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
3982 + nla_total_size(sizeof(struct rta_cacheinfo))
3983 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
3984 + nla_total_size(1) /* RTA_PREF */
3985 + lwtunnel_get_encap_size(rt->dst.lwtstate)
/* rt6_nexthop_info(): emit the per-nexthop attributes (gateway, oif,
 * lwtunnel encap) and accumulate RTNH_F_* status bits into @flags.
 * @skip_oif is true inside RTA_MULTIPATH, where the rtnexthop struct
 * already carries the ifindex.
 */
3989 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
3990 unsigned int *flags, bool skip_oif)
3992 if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) {
3993 *flags |= RTNH_F_LINKDOWN;
3994 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
3995 *flags |= RTNH_F_DEAD;
3998 if (rt->rt6i_flags & RTF_GATEWAY) {
3999 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
4000 goto nla_put_failure;
4003 if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
4004 *flags |= RTNH_F_OFFLOAD;
4006 /* not needed for multipath encoding b/c it has a rtnexthop struct */
4007 if (!skip_oif && rt->dst.dev &&
4008 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
4009 goto nla_put_failure;
4011 if (rt->dst.lwtstate &&
4012 lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
4013 goto nla_put_failure;
4021 /* add multipath next hop */
/* rt6_add_nexthop(): reserve a rtnexthop header, fill the per-hop
 * attributes via rt6_nexthop_info(), then patch in flags and the
 * final encoded length. */
4022 static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
4024 struct rtnexthop *rtnh;
4025 unsigned int flags = 0;
4027 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4029 goto nla_put_failure;
4031 rtnh->rtnh_hops = 0;
4032 rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
4034 if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4035 goto nla_put_failure;
4037 rtnh->rtnh_flags = flags;
4039 /* length of rtnetlink header + attributes */
4040 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
/* rt6_fill_node(): serialise @rt into an RTM_* netlink message on
 * @skb: rtmsg header, table, route type (derived from flags and
 * dst.error for reject routes), addresses, metrics, multipath
 * nexthops, cacheinfo and router preference. Returns <0 and cancels
 * the message on overflow.
 * NOTE(review): sampled fragment -- some branch/label lines are
 * missing between the visible statements.
 */
4048 static int rt6_fill_node(struct net *net,
4049 struct sk_buff *skb, struct rt6_info *rt,
4050 struct in6_addr *dst, struct in6_addr *src,
4051 int iif, int type, u32 portid, u32 seq,
4054 u32 metrics[RTAX_MAX];
4056 struct nlmsghdr *nlh;
4060 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4064 rtm = nlmsg_data(nlh);
4065 rtm->rtm_family = AF_INET6;
4066 rtm->rtm_dst_len = rt->rt6i_dst.plen;
4067 rtm->rtm_src_len = rt->rt6i_src.plen;
4070 table = rt->rt6i_table->tb6_id;
4072 table = RT6_TABLE_UNSPEC;
4073 rtm->rtm_table = table;
4074 if (nla_put_u32(skb, RTA_TABLE, table))
4075 goto nla_put_failure;
/* Reject routes encode their specific reason in dst.error. */
4076 if (rt->rt6i_flags & RTF_REJECT) {
4077 switch (rt->dst.error) {
4079 rtm->rtm_type = RTN_BLACKHOLE;
4082 rtm->rtm_type = RTN_PROHIBIT;
4085 rtm->rtm_type = RTN_THROW;
4088 rtm->rtm_type = RTN_UNREACHABLE;
4092 else if (rt->rt6i_flags & RTF_LOCAL)
4093 rtm->rtm_type = RTN_LOCAL;
4094 else if (rt->rt6i_flags & RTF_ANYCAST)
4095 rtm->rtm_type = RTN_ANYCAST;
4096 else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
4097 rtm->rtm_type = RTN_LOCAL;
4099 rtm->rtm_type = RTN_UNICAST;
4101 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4102 rtm->rtm_protocol = rt->rt6i_protocol;
4104 if (rt->rt6i_flags & RTF_CACHE)
4105 rtm->rtm_flags |= RTM_F_CLONED;
/* Caller-supplied dst/src (getroute) override the route's own
 * prefix and force a /128 length. */
4108 if (nla_put_in6_addr(skb, RTA_DST, dst))
4109 goto nla_put_failure;
4110 rtm->rtm_dst_len = 128;
4111 } else if (rtm->rtm_dst_len)
4112 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
4113 goto nla_put_failure;
4114 #ifdef CONFIG_IPV6_SUBTREES
4116 if (nla_put_in6_addr(skb, RTA_SRC, src))
4117 goto nla_put_failure;
4118 rtm->rtm_src_len = 128;
4119 } else if (rtm->rtm_src_len &&
4120 nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
4121 goto nla_put_failure;
4124 #ifdef CONFIG_IPV6_MROUTE
4125 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
4126 int err = ip6mr_get_route(net, skb, rtm, portid);
4131 goto nla_put_failure;
4134 if (nla_put_u32(skb, RTA_IIF, iif))
4135 goto nla_put_failure;
4137 struct in6_addr saddr_buf;
4138 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
4139 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4140 goto nla_put_failure;
4143 if (rt->rt6i_prefsrc.plen) {
4144 struct in6_addr saddr_buf;
4145 saddr_buf = rt->rt6i_prefsrc.addr;
4146 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4147 goto nla_put_failure;
/* Report the cached PMTU (rt6i_pmtu) in place of the raw metric. */
4150 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
4152 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
4153 if (rtnetlink_put_metrics(skb, metrics) < 0)
4154 goto nla_put_failure;
4156 if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
4157 goto nla_put_failure;
4159 /* For multipath routes, walk the siblings list and add
4160 * each as a nexthop within RTA_MULTIPATH.
4162 if (rt->rt6i_nsiblings) {
4163 struct rt6_info *sibling, *next_sibling;
4166 mp = nla_nest_start(skb, RTA_MULTIPATH);
4168 goto nla_put_failure;
4170 if (rt6_add_nexthop(skb, rt) < 0)
4171 goto nla_put_failure;
4173 list_for_each_entry_safe(sibling, next_sibling,
4174 &rt->rt6i_siblings, rt6i_siblings) {
4175 if (rt6_add_nexthop(skb, sibling) < 0)
4176 goto nla_put_failure;
4179 nla_nest_end(skb, mp);
4181 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4182 goto nla_put_failure;
4185 expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
4187 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
4188 goto nla_put_failure;
4190 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
4191 goto nla_put_failure;
4194 nlmsg_end(skb, nlh);
4198 nlmsg_cancel(skb, nlh);
/* rt6_dump_route(): fib6 walker callback for RTM_GETROUTE dumps --
 * skip the null entry, honour the RTM_F_PREFIX filter, and emit the
 * route via rt6_fill_node() into the dump skb.
 */
4202 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
4204 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4205 struct net *net = arg->net;
4207 if (rt == net->ipv6.ip6_null_entry)
4210 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4211 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4213 /* user wants prefix routes only */
4214 if (rtm->rtm_flags & RTM_F_PREFIX &&
4215 !(rt->rt6i_flags & RTF_PREFIX_RT)) {
4216 /* success since this is not a prefix route */
4221 return rt6_fill_node(net,
4222 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
4223 NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
/* inet6_rtm_getroute(): RTM_GETROUTE handler -- build a flow from
 * the request attributes, do an input-side lookup when RTA_IIF is
 * given or an output-side lookup otherwise (RTM_F_FIB_MATCH skips
 * the output policy and returns the raw FIB match), then reply with
 * a unicast RTM_NEWROUTE.
 * NOTE(review): sampled fragment -- several error-exit and
 * rcu lock/unlock lines are not visible.
 */
4227 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4228 struct netlink_ext_ack *extack)
4230 struct net *net = sock_net(in_skb->sk);
4231 struct nlattr *tb[RTA_MAX+1];
4232 int err, iif = 0, oif = 0;
4233 struct dst_entry *dst;
4234 struct rt6_info *rt;
4235 struct sk_buff *skb;
4240 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4246 memset(&fl6, 0, sizeof(fl6));
4247 rtm = nlmsg_data(nlh);
4248 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4249 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4252 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4255 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4259 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4262 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4266 iif = nla_get_u32(tb[RTA_IIF]);
4269 oif = nla_get_u32(tb[RTA_OIF]);
4272 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4275 fl6.flowi6_uid = make_kuid(current_user_ns(),
4276 nla_get_u32(tb[RTA_UID]));
4278 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
/* Input-side lookup: resolve the incoming device by index. */
4281 struct net_device *dev;
4286 dev = dev_get_by_index_rcu(net, iif);
4293 fl6.flowi6_iif = iif;
4295 if (!ipv6_addr_any(&fl6.saddr))
4296 flags |= RT6_LOOKUP_F_HAS_SADDR;
4299 dst = ip6_route_input_lookup(net, dev, &fl6, flags);
4301 dst = ip6_route_lookup(net, &fl6, 0);
4305 fl6.flowi6_oif = oif;
4308 dst = ip6_route_output(net, NULL, &fl6);
4310 dst = ip6_route_lookup(net, &fl6, 0);
4314 rt = container_of(dst, struct rt6_info, dst);
4315 if (rt->dst.error) {
4316 err = rt->dst.error;
4321 if (rt == net->ipv6.ip6_null_entry) {
4322 err = rt->dst.error;
4327 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
/* Transfer the dst reference to the reply skb. */
4334 skb_dst_set(skb, &rt->dst);
/* fibmatch replies omit the resolved dst/src addresses. */
4336 err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
4337 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4340 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
4341 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4348 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
/* Notify RTNLGRP_IPV6_ROUTE listeners about a route change (@event is
 * RTM_NEWROUTE/RTM_DELROUTE); on any failure the group error state is
 * set via rtnl_set_sk_err() so listeners learn they missed an event.
 */
4353 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
4354 unsigned int nlm_flags)
4356 struct sk_buff *skb;
4357 struct net *net = info->nl_net;
/* Echo the triggering request's sequence number when there is one. */
4362 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
/* rt6_nlmsg_size() must be an upper bound on the message size. */
4364 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4368 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
4369 event, info->portid, seq, nlm_flags);
4371 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4372 WARN_ON(err == -EMSGSIZE);
4376 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4377 info->nlh, gfp_any());
/* Error path: flag the multicast group as having lost an event. */
4381 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
/* Netdevice notifier: wire the per-netns special routes (null, and with
 * CONFIG_IPV6_MULTIPLE_TABLES also prohibit/blackhole) to the loopback
 * device on NETDEV_REGISTER, and drop their inet6_dev references on the
 * first NETDEV_UNREGISTER. Non-loopback devices are ignored.
 */
4384 static int ip6_route_dev_notify(struct notifier_block *this,
4385 unsigned long event, void *ptr)
4387 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4388 struct net *net = dev_net(dev);
/* Only the loopback device carries the special route entries. */
4390 if (!(dev->flags & IFF_LOOPBACK))
4393 if (event == NETDEV_REGISTER) {
4394 net->ipv6.ip6_null_entry->dst.dev = dev;
4395 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
4396 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4397 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
4398 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
4399 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
4400 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
/* reg_state check guards against repeated UNREGISTER events. */
4402 } else if (event == NETDEV_UNREGISTER &&
4403 dev->reg_state != NETREG_UNREGISTERED) {
4404 /* NETDEV_UNREGISTER could be fired for multiple times by
4405 * netdev_wait_allrefs(). Make sure we only call this once.
4407 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
4408 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4409 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
4410 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
4421 #ifdef CONFIG_PROC_FS
/* file_operations for /proc/net/ipv6_route (seq_file based).
 * NOTE(review): the .read hook line is elided in this extraction. */
4423 static const struct file_operations ipv6_route_proc_fops = {
4424 .owner = THIS_MODULE,
4425 .open = ipv6_route_open,
4427 .llseek = seq_lseek,
4428 .release = seq_release_net,
/* seq_file show callback for /proc/net/rt6_stats: one line of seven
 * hex counters from the per-netns rt6_stats plus the slow dst count.
 */
4431 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
4433 struct net *net = (struct net *)seq->private;
/* Order: nodes, route nodes, allocs, entries, cache, dst count, discards. */
4434 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
4435 net->ipv6.rt6_stats->fib_nodes,
4436 net->ipv6.rt6_stats->fib_route_nodes,
4437 atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
4438 net->ipv6.rt6_stats->fib_rt_entries,
4439 net->ipv6.rt6_stats->fib_rt_cache,
4440 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
4441 net->ipv6.rt6_stats->fib_discarded_routes);
/* open callback for /proc/net/rt6_stats: single-shot netns seq_file. */
4446 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
4448 return single_open_net(inode, file, rt6_stats_seq_show);
/* file_operations for /proc/net/rt6_stats.
 * NOTE(review): the .read hook line is elided in this extraction. */
4451 static const struct file_operations rt6_stats_seq_fops = {
4452 .owner = THIS_MODULE,
4453 .open = rt6_stats_seq_open,
4455 .llseek = seq_lseek,
4456 .release = single_release_net,
4458 #endif /* CONFIG_PROC_FS */
4460 #ifdef CONFIG_SYSCTL
/* Handler for net.ipv6.route.flush: on write, read the requested delay
 * and trigger fib6 garbage collection (delay <= 0 forces an immediate,
 * full run). The owning netns is carried in ctl->extra1.
 */
4463 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4464 void __user *buffer, size_t *lenp, loff_t *ppos)
4471 net = (struct net *)ctl->extra1;
4472 delay = net->ipv6.sysctl.flush_delay;
4473 proc_dointvec(ctl, write, buffer, lenp, ppos);
4474 fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
/* Template for the per-netns net.ipv6.route.* sysctl table; entries are
 * duplicated and re-pointed at netns storage in ipv6_route_sysctl_init().
 * The slot order here must match the table[N].data assignments there.
 * NOTE(review): .mode lines appear elided in this extraction.
 */
4478 struct ctl_table ipv6_route_table_template[] = {
4480 .procname = "flush",
4481 .data = &init_net.ipv6.sysctl.flush_delay,
4482 .maxlen = sizeof(int),
4484 .proc_handler = ipv6_sysctl_rtcache_flush
4487 .procname = "gc_thresh",
4488 .data = &ip6_dst_ops_template.gc_thresh,
4489 .maxlen = sizeof(int),
4491 .proc_handler = proc_dointvec,
4494 .procname = "max_size",
4495 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
4496 .maxlen = sizeof(int),
4498 .proc_handler = proc_dointvec,
4501 .procname = "gc_min_interval",
4502 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4503 .maxlen = sizeof(int),
4505 .proc_handler = proc_dointvec_jiffies,
4508 .procname = "gc_timeout",
4509 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
4510 .maxlen = sizeof(int),
4512 .proc_handler = proc_dointvec_jiffies,
4515 .procname = "gc_interval",
4516 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
4517 .maxlen = sizeof(int),
4519 .proc_handler = proc_dointvec_jiffies,
4522 .procname = "gc_elasticity",
4523 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
4524 .maxlen = sizeof(int),
4526 .proc_handler = proc_dointvec,
4529 .procname = "mtu_expires",
4530 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
4531 .maxlen = sizeof(int),
4533 .proc_handler = proc_dointvec_jiffies,
4536 .procname = "min_adv_mss",
4537 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
4538 .maxlen = sizeof(int),
4540 .proc_handler = proc_dointvec,
/* Same variable as gc_min_interval, exposed in milliseconds. */
4543 .procname = "gc_min_interval_ms",
4544 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4545 .maxlen = sizeof(int),
4547 .proc_handler = proc_dointvec_ms_jiffies,
/* Duplicate ipv6_route_table_template for a netns and point each entry's
 * .data at that netns's own variables. Index order must stay in sync
 * with the template. Caller frees the returned table on teardown.
 */
4552 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
4554 struct ctl_table *table;
4556 table = kmemdup(ipv6_route_table_template,
4557 sizeof(ipv6_route_table_template),
/* Rebind every slot to per-netns storage; extra1 carries the netns
 * for the flush handler. */
4561 table[0].data = &net->ipv6.sysctl.flush_delay;
4562 table[0].extra1 = net;
4563 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
4564 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
4565 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4566 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
4567 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
4568 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
4569 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
4570 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
4571 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4573 /* Don't export sysctls to unprivileged users */
4574 if (net->user_ns != &init_user_ns)
4575 table[0].procname = NULL;
/* Per-netns init: copy the dst_ops template, allocate the special route
 * entries (null; plus prohibit/blackhole with multiple tables), and set
 * routing sysctl defaults. Unwinds allocations via goto labels on error.
 * NOTE(review): some lines (gfp flags, ret assignments, labels) are
 * elided in this extraction — verify the unwind chain in the full source.
 */
4582 static int __net_init ip6_route_net_init(struct net *net)
4586 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
4587 sizeof(net->ipv6.ip6_dst_ops));
4589 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
4590 goto out_ip6_dst_ops;
/* null entry: the always-present "no route" blackhole. */
4592 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
4593 sizeof(*net->ipv6.ip6_null_entry),
4595 if (!net->ipv6.ip6_null_entry)
4596 goto out_ip6_dst_entries;
4597 net->ipv6.ip6_null_entry->dst.path =
4598 (struct dst_entry *)net->ipv6.ip6_null_entry;
4599 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4600 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
4601 ip6_template_metrics, true);
4603 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4604 net->ipv6.fib6_has_custom_rules = false;
4605 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
4606 sizeof(*net->ipv6.ip6_prohibit_entry),
4608 if (!net->ipv6.ip6_prohibit_entry)
4609 goto out_ip6_null_entry;
4610 net->ipv6.ip6_prohibit_entry->dst.path =
4611 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
4612 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4613 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
4614 ip6_template_metrics, true);
4616 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
4617 sizeof(*net->ipv6.ip6_blk_hole_entry),
4619 if (!net->ipv6.ip6_blk_hole_entry)
4620 goto out_ip6_prohibit_entry;
4621 net->ipv6.ip6_blk_hole_entry->dst.path =
4622 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
4623 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4624 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
4625 ip6_template_metrics, true);
/* Routing sysctl defaults for this netns. */
4628 net->ipv6.sysctl.flush_delay = 0;
4629 net->ipv6.sysctl.ip6_rt_max_size = 4096;
4630 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
4631 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
4632 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
4633 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
4634 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
/* IPV6_MIN_MTU minus transport (20) and IPv6 (40) header overhead. */
4635 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
4637 net->ipv6.ip6_rt_gc_expire = 30*HZ;
/* Error unwind: free in reverse order of allocation. */
4643 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4644 out_ip6_prohibit_entry:
4645 kfree(net->ipv6.ip6_prohibit_entry);
4647 kfree(net->ipv6.ip6_null_entry);
4649 out_ip6_dst_entries:
4650 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/* Per-netns teardown: free the special route entries allocated by
 * ip6_route_net_init() and destroy the dst entry counter. */
4655 static void __net_exit ip6_route_net_exit(struct net *net)
4657 kfree(net->ipv6.ip6_null_entry);
4658 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4659 kfree(net->ipv6.ip6_prohibit_entry);
4660 kfree(net->ipv6.ip6_blk_hole_entry);
4662 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/* Late per-netns init: create the /proc/net/ipv6_route and
 * /proc/net/rt6_stats entries (PROC_FS only). */
4665 static int __net_init ip6_route_net_init_late(struct net *net)
4667 #ifdef CONFIG_PROC_FS
4668 proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
4669 proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
/* Late per-netns teardown: remove the proc entries created by
 * ip6_route_net_init_late(). */
4674 static void __net_exit ip6_route_net_exit_late(struct net *net)
4676 #ifdef CONFIG_PROC_FS
4677 remove_proc_entry("ipv6_route", net->proc_net);
4678 remove_proc_entry("rt6_stats", net->proc_net);
/* Main pernet operations for IPv6 routing state. */
4682 static struct pernet_operations ip6_route_net_ops = {
4683 .init = ip6_route_net_init,
4684 .exit = ip6_route_net_exit,
/* Allocate and initialize the per-netns IPv6 inet_peer base. */
4687 static int __net_init ipv6_inetpeer_init(struct net *net)
4689 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
4693 inet_peer_base_init(bp);
4694 net->ipv6.peers = bp;
/* Tear down the per-netns inet_peer base: detach it first, then
 * invalidate the peer tree (which releases its entries). */
4698 static void __net_exit ipv6_inetpeer_exit(struct net *net)
4700 struct inet_peer_base *bp = net->ipv6.peers;
4702 net->ipv6.peers = NULL;
4703 inetpeer_invalidate_tree(bp);
/* Pernet operations for the IPv6 inet_peer base. */
4707 static struct pernet_operations ipv6_inetpeer_ops = {
4708 .init = ipv6_inetpeer_init,
4709 .exit = ipv6_inetpeer_exit,
/* Late pernet operations: procfs entries, registered after fib rules. */
4712 static struct pernet_operations ip6_route_net_late_ops = {
4713 .init = ip6_route_net_init_late,
4714 .exit = ip6_route_net_exit_late,
/* Netdevice notifier; priority places it relative to addrconf's
 * notifier (ADDRCONF_NOTIFY_PRIORITY - 10). */
4717 static struct notifier_block ip6_route_dev_notifier = {
4718 .notifier_call = ip6_route_dev_notify,
4719 .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
/* Bind init_net's special route entries to the loopback device. Done
 * here because loopback registers before the route notifier exists, so
 * the NETDEV_REGISTER path never ran for init_net. */
4722 void __init ip6_route_init_special_entries(void)
4724 /* Registering of the loopback is done before this portion of code,
4725 * the loopback reference in rt6_info will not be taken, do it
4726 * manually for init_net */
4727 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
4728 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4729 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4730 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
4731 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4732 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
4733 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
/* Module init for IPv6 routing: create the dst slab cache, register
 * pernet subsystems, fib6 core/rules, rtnetlink handlers, the device
 * notifier, and the per-cpu uncached route lists. Errors unwind in
 * reverse registration order through the goto labels.
 * NOTE(review): several lines (ret declarations, error checks, labels)
 * are elided in this extraction — verify the unwind chain in full source.
 */
4737 int __init ip6_route_init(void)
4743 ip6_dst_ops_template.kmem_cachep =
4744 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
4745 SLAB_HWCACHE_ALIGN, NULL);
4746 if (!ip6_dst_ops_template.kmem_cachep)
4749 ret = dst_entries_init(&ip6_dst_blackhole_ops);
4751 goto out_kmem_cache;
4753 ret = register_pernet_subsys(&ipv6_inetpeer_ops);
4755 goto out_dst_entries;
4757 ret = register_pernet_subsys(&ip6_route_net_ops);
4759 goto out_register_inetpeer;
/* Blackhole dsts share the same slab cache as regular routes. */
4761 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
4765 goto out_register_subsys;
4771 ret = fib6_rules_init();
4775 ret = register_pernet_subsys(&ip6_route_net_late_ops);
4777 goto fib6_rules_init;
/* rtnetlink message handlers; GETROUTE runs without the rtnl lock. */
4780 if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, 0) ||
4781 __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, 0) ||
4782 __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL,
4783 RTNL_FLAG_DOIT_UNLOCKED))
4784 goto out_register_late_subsys;
4786 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
4788 goto out_register_late_subsys;
/* Initialize the per-cpu uncached-route lists. */
4790 for_each_possible_cpu(cpu) {
4791 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
4793 INIT_LIST_HEAD(&ul->head);
4794 spin_lock_init(&ul->lock);
/* Error unwind, reverse order of registration. */
4800 out_register_late_subsys:
4801 unregister_pernet_subsys(&ip6_route_net_late_ops);
4803 fib6_rules_cleanup();
4808 out_register_subsys:
4809 unregister_pernet_subsys(&ip6_route_net_ops);
4810 out_register_inetpeer:
4811 unregister_pernet_subsys(&ipv6_inetpeer_ops);
4813 dst_entries_destroy(&ip6_dst_blackhole_ops);
4815 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
/* Module exit: undo ip6_route_init() in reverse registration order.
 * NOTE(review): closing lines run past this extraction's view. */
4819 void ip6_route_cleanup(void)
4821 unregister_netdevice_notifier(&ip6_route_dev_notifier);
4822 unregister_pernet_subsys(&ip6_route_net_late_ops);
4823 fib6_rules_cleanup();
4826 unregister_pernet_subsys(&ipv6_inetpeer_ops);
4827 unregister_pernet_subsys(&ip6_route_net_ops);
4828 dst_entries_destroy(&ip6_dst_blackhole_ops);
4829 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);