/*
 *	Linux INET6 implementation
 *
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable.  Otherwise, round-robin the list.
 *
 *	Fixed routing subtrees.
 */
27 #define pr_fmt(fmt) "IPv6: " fmt
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
56 #include <linux/rtnetlink.h>
58 #include <net/dst_metadata.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <trace/events/fib6.h>
68 #include <linux/uaccess.h>
71 #include <linux/sysctl.h>
75 RT6_NUD_FAIL_HARD = -3,
76 RT6_NUD_FAIL_PROBE = -2,
77 RT6_NUD_FAIL_DO_RR = -1,
81 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
82 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
83 static unsigned int ip6_mtu(const struct dst_entry *dst);
84 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
85 static void ip6_dst_destroy(struct dst_entry *);
86 static void ip6_dst_ifdown(struct dst_entry *,
87 struct net_device *dev, int how);
88 static int ip6_dst_gc(struct dst_ops *ops);
90 static int ip6_pkt_discard(struct sk_buff *skb);
91 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
92 static int ip6_pkt_prohibit(struct sk_buff *skb);
93 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
94 static void ip6_link_failure(struct sk_buff *skb);
95 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
96 struct sk_buff *skb, u32 mtu);
97 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
99 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
100 static size_t rt6_nlmsg_size(struct rt6_info *rt);
101 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
102 struct rt6_info *rt, struct dst_entry *dst,
103 struct in6_addr *dest, struct in6_addr *src,
104 int iif, int type, u32 portid, u32 seq,
106 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
107 struct in6_addr *daddr,
108 struct in6_addr *saddr);
110 #ifdef CONFIG_IPV6_ROUTE_INFO
111 static struct rt6_info *rt6_add_route_info(struct net *net,
112 const struct in6_addr *prefix, int prefixlen,
113 const struct in6_addr *gwaddr,
114 struct net_device *dev,
116 static struct rt6_info *rt6_get_route_info(struct net *net,
117 const struct in6_addr *prefix, int prefixlen,
118 const struct in6_addr *gwaddr,
119 struct net_device *dev);
122 struct uncached_list {
124 struct list_head head;
127 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
129 void rt6_uncached_list_add(struct rt6_info *rt)
131 struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
133 rt->rt6i_uncached_list = ul;
135 spin_lock_bh(&ul->lock);
136 list_add_tail(&rt->rt6i_uncached, &ul->head);
137 spin_unlock_bh(&ul->lock);
140 void rt6_uncached_list_del(struct rt6_info *rt)
142 if (!list_empty(&rt->rt6i_uncached)) {
143 struct uncached_list *ul = rt->rt6i_uncached_list;
144 struct net *net = dev_net(rt->dst.dev);
146 spin_lock_bh(&ul->lock);
147 list_del(&rt->rt6i_uncached);
148 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
149 spin_unlock_bh(&ul->lock);
153 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
155 struct net_device *loopback_dev = net->loopback_dev;
158 if (dev == loopback_dev)
161 for_each_possible_cpu(cpu) {
162 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
165 spin_lock_bh(&ul->lock);
166 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
167 struct inet6_dev *rt_idev = rt->rt6i_idev;
168 struct net_device *rt_dev = rt->dst.dev;
170 if (rt_idev->dev == dev) {
171 rt->rt6i_idev = in6_dev_get(loopback_dev);
172 in6_dev_put(rt_idev);
176 rt->dst.dev = loopback_dev;
177 dev_hold(rt->dst.dev);
181 spin_unlock_bh(&ul->lock);
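/* Illustrative sketch, not part of the build: the uncached list above is a
 * set of per-CPU, spinlock-protected linked lists.  A standalone userspace
 * analogue (a bucket array with a pthread mutex instead of per-CPU data and
 * spin_lock_bh; all names are hypothetical) could look like this.
 */
#if 0
#include <pthread.h>

struct ul_node { struct ul_node *prev, *next; };

struct ul_bucket {
	pthread_mutex_t lock;
	struct ul_node head;			/* circular list head */
};

static void ul_bucket_init(struct ul_bucket *b)
{
	pthread_mutex_init(&b->lock, NULL);
	b->head.prev = b->head.next = &b->head;
}

/* like list_add_tail() under the bucket lock */
static void ul_add(struct ul_bucket *b, struct ul_node *n)
{
	pthread_mutex_lock(&b->lock);
	n->prev = b->head.prev;
	n->next = &b->head;
	b->head.prev->next = n;
	b->head.prev = n;
	pthread_mutex_unlock(&b->lock);
}

/* like list_del() under the bucket lock; only valid on a linked node */
static void ul_del(struct ul_bucket *b, struct ul_node *n)
{
	pthread_mutex_lock(&b->lock);
	n->prev->next = n->next;
	n->next->prev = n->prev;
	n->prev = n->next = n;
	pthread_mutex_unlock(&b->lock);
}
#endif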
185 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
189 if (!ipv6_addr_any(p))
190 return (const void *) p;
192 return &ipv6_hdr(skb)->daddr;
196 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
197 struct net_device *dev,
203 daddr = choose_neigh_daddr(gw, skb, daddr);
204 n = __ipv6_neigh_lookup(dev, daddr);
207 return neigh_create(&nd_tbl, daddr, dev);
210 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
214 const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
216 return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
219 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
221 struct net_device *dev = dst->dev;
222 struct rt6_info *rt = (struct rt6_info *)dst;
224 daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
227 if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
229 if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
231 __ipv6_confirm_neigh(dev, daddr);
234 static struct dst_ops ip6_dst_ops_template = {
238 .check = ip6_dst_check,
239 .default_advmss = ip6_default_advmss,
241 .cow_metrics = dst_cow_metrics_generic,
242 .destroy = ip6_dst_destroy,
243 .ifdown = ip6_dst_ifdown,
244 .negative_advice = ip6_negative_advice,
245 .link_failure = ip6_link_failure,
246 .update_pmtu = ip6_rt_update_pmtu,
247 .redirect = rt6_do_redirect,
248 .local_out = __ip6_local_out,
249 .neigh_lookup = ip6_dst_neigh_lookup,
250 .confirm_neigh = ip6_confirm_neigh,
253 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
255 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
257 return mtu ? : dst->dev->mtu;
260 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
261 struct sk_buff *skb, u32 mtu)
265 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
270 static struct dst_ops ip6_dst_blackhole_ops = {
272 .destroy = ip6_dst_destroy,
273 .check = ip6_dst_check,
274 .mtu = ip6_blackhole_mtu,
275 .default_advmss = ip6_default_advmss,
276 .update_pmtu = ip6_rt_blackhole_update_pmtu,
277 .redirect = ip6_rt_blackhole_redirect,
278 .cow_metrics = dst_cow_metrics_generic,
279 .neigh_lookup = ip6_dst_neigh_lookup,
282 static const u32 ip6_template_metrics[RTAX_MAX] = {
283 [RTAX_HOPLIMIT - 1] = 0,
286 static const struct rt6_info fib6_null_entry_template = {
287 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
288 .rt6i_protocol = RTPROT_KERNEL,
289 .rt6i_metric = ~(u32)0,
290 .rt6i_ref = ATOMIC_INIT(1),
291 .fib6_type = RTN_UNREACHABLE,
292 .fib6_metrics = (struct dst_metrics *)&dst_default_metrics,
295 static const struct rt6_info ip6_null_entry_template = {
297 .__refcnt = ATOMIC_INIT(1),
299 .obsolete = DST_OBSOLETE_FORCE_CHK,
300 .error = -ENETUNREACH,
301 .input = ip6_pkt_discard,
302 .output = ip6_pkt_discard_out,
304 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
305 .rt6i_protocol = RTPROT_KERNEL,
306 .rt6i_metric = ~(u32) 0,
307 .rt6i_ref = ATOMIC_INIT(1),
308 .fib6_type = RTN_UNREACHABLE,
311 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
313 static const struct rt6_info ip6_prohibit_entry_template = {
315 .__refcnt = ATOMIC_INIT(1),
317 .obsolete = DST_OBSOLETE_FORCE_CHK,
319 .input = ip6_pkt_prohibit,
320 .output = ip6_pkt_prohibit_out,
322 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
323 .rt6i_protocol = RTPROT_KERNEL,
324 .rt6i_metric = ~(u32) 0,
325 .rt6i_ref = ATOMIC_INIT(1),
326 .fib6_type = RTN_PROHIBIT,
329 static const struct rt6_info ip6_blk_hole_entry_template = {
331 .__refcnt = ATOMIC_INIT(1),
333 .obsolete = DST_OBSOLETE_FORCE_CHK,
335 .input = dst_discard,
336 .output = dst_discard_out,
338 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
339 .rt6i_protocol = RTPROT_KERNEL,
340 .rt6i_metric = ~(u32) 0,
341 .rt6i_ref = ATOMIC_INIT(1),
342 .fib6_type = RTN_BLACKHOLE,
347 static void rt6_info_init(struct rt6_info *rt)
349 struct dst_entry *dst = &rt->dst;
351 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
352 INIT_LIST_HEAD(&rt->rt6i_siblings);
353 INIT_LIST_HEAD(&rt->rt6i_uncached);
356 /* allocate dst with ip6_dst_ops */
357 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
360 struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
361 1, DST_OBSOLETE_FORCE_CHK, flags);
365 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
370 EXPORT_SYMBOL(ip6_dst_alloc);
372 static void ip6_dst_destroy(struct dst_entry *dst)
374 struct rt6_info *rt = (struct rt6_info *)dst;
375 struct rt6_info *from = rt->from;
376 struct inet6_dev *idev;
378 dst_destroy_metrics_generic(dst);
379 rt6_uncached_list_del(rt);
381 idev = rt->rt6i_idev;
383 rt->rt6i_idev = NULL;
388 fib6_info_release(from);
391 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
394 struct rt6_info *rt = (struct rt6_info *)dst;
395 struct inet6_dev *idev = rt->rt6i_idev;
396 struct net_device *loopback_dev =
397 dev_net(dev)->loopback_dev;
399 if (idev && idev->dev != loopback_dev) {
400 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
402 rt->rt6i_idev = loopback_idev;
408 static bool __rt6_check_expired(const struct rt6_info *rt)
410 if (rt->rt6i_flags & RTF_EXPIRES)
411 return time_after(jiffies, rt->dst.expires);
416 static bool rt6_check_expired(const struct rt6_info *rt)
418 if (rt->rt6i_flags & RTF_EXPIRES) {
419 if (time_after(jiffies, rt->dst.expires))
421 } else if (rt->from) {
422 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
423 fib6_check_expired(rt->from);
428 static struct rt6_info *rt6_multipath_select(const struct net *net,
429 struct rt6_info *match,
430 struct flowi6 *fl6, int oif,
431 const struct sk_buff *skb,
434 struct rt6_info *sibling, *next_sibling;
/* We might have already computed the hash for ICMPv6 errors. In such
 * a case it will always be non-zero. Otherwise, now is the time to do it.
 */
440 fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
442 if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
445 list_for_each_entry_safe(sibling, next_sibling, &match->rt6i_siblings,
449 nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
450 if (fl6->mp_hash > nh_upper_bound)
452 if (rt6_score_route(sibling, oif, strict) < 0)
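/* Illustrative sketch, not part of the build: the sibling walk above is
 * hash-threshold next-hop selection.  Every next hop owns a slice of the
 * hash space bounded by nh_upper_bound (derived from its weight, in
 * ascending order); the first next hop whose bound is >= the packet hash
 * carries the flow.  Standalone version with hypothetical names:
 */
#if 0
#include <stdint.h>

struct mp_nexthop {
	uint32_t upper_bound;	/* precomputed from weights, ascending */
	int usable;		/* 0 if dead or unreachable */
};

static int mp_select(const struct mp_nexthop *nh, int n, uint32_t hash)
{
	for (int i = 0; i < n; i++) {
		if (hash > nh[i].upper_bound)
			continue;	/* hash lies beyond this hop's slice */
		if (!nh[i].usable)
			break;		/* like score < 0: give up, keep default */
		return i;
	}
	return 0;			/* default: the original match */
}
#endif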
462 * Route lookup. rcu_read_lock() should be held.
465 static inline struct rt6_info *rt6_device_match(struct net *net,
467 const struct in6_addr *saddr,
471 struct rt6_info *local = NULL;
472 struct rt6_info *sprt;
474 if (!oif && ipv6_addr_any(saddr) &&
475 !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
478 for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) {
479 const struct net_device *dev = sprt->fib6_nh.nh_dev;
481 if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
485 if (dev->ifindex == oif)
487 if (dev->flags & IFF_LOOPBACK) {
488 if (!sprt->rt6i_idev ||
489 sprt->rt6i_idev->dev->ifindex != oif) {
490 if (flags & RT6_LOOKUP_F_IFACE)
493 local->rt6i_idev->dev->ifindex == oif)
499 if (ipv6_chk_addr(net, saddr, dev,
500 flags & RT6_LOOKUP_F_IFACE))
509 if (flags & RT6_LOOKUP_F_IFACE)
510 return net->ipv6.fib6_null_entry;
513 return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
516 #ifdef CONFIG_IPV6_ROUTER_PREF
517 struct __rt6_probe_work {
518 struct work_struct work;
519 struct in6_addr target;
520 struct net_device *dev;
523 static void rt6_probe_deferred(struct work_struct *w)
525 struct in6_addr mcaddr;
526 struct __rt6_probe_work *work =
527 container_of(w, struct __rt6_probe_work, work);
529 addrconf_addr_solict_mult(&work->target, &mcaddr);
530 ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
535 static void rt6_probe(struct rt6_info *rt)
537 struct __rt6_probe_work *work;
538 const struct in6_addr *nh_gw;
539 struct neighbour *neigh;
540 struct net_device *dev;
543 * Okay, this does not seem to be appropriate
544 * for now, however, we need to check if it
545 * is really so; aka Router Reachability Probing.
547 * Router Reachability Probe MUST be rate-limited
548 * to no more than one per minute.
550 if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
553 nh_gw = &rt->fib6_nh.nh_gw;
554 dev = rt->fib6_nh.nh_dev;
556 neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
558 if (neigh->nud_state & NUD_VALID)
562 write_lock(&neigh->lock);
563 if (!(neigh->nud_state & NUD_VALID) &&
566 rt->rt6i_idev->cnf.rtr_probe_interval)) {
567 work = kmalloc(sizeof(*work), GFP_ATOMIC);
569 __neigh_set_probe_once(neigh);
571 write_unlock(&neigh->lock);
573 work = kmalloc(sizeof(*work), GFP_ATOMIC);
577 INIT_WORK(&work->work, rt6_probe_deferred);
578 work->target = *nh_gw;
581 schedule_work(&work->work);
585 rcu_read_unlock_bh();
588 static inline void rt6_probe(struct rt6_info *rt)
594 * Default Router Selection (RFC 2461 6.3.6)
596 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
598 const struct net_device *dev = rt->fib6_nh.nh_dev;
600 if (!oif || dev->ifindex == oif)
602 if ((dev->flags & IFF_LOOPBACK) &&
603 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
608 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
610 enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
611 struct neighbour *neigh;
613 if (rt->rt6i_flags & RTF_NONEXTHOP ||
614 !(rt->rt6i_flags & RTF_GATEWAY))
615 return RT6_NUD_SUCCEED;
618 neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
621 read_lock(&neigh->lock);
622 if (neigh->nud_state & NUD_VALID)
623 ret = RT6_NUD_SUCCEED;
624 #ifdef CONFIG_IPV6_ROUTER_PREF
625 else if (!(neigh->nud_state & NUD_FAILED))
626 ret = RT6_NUD_SUCCEED;
628 ret = RT6_NUD_FAIL_PROBE;
630 read_unlock(&neigh->lock);
632 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
633 RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
635 rcu_read_unlock_bh();
640 static int rt6_score_route(struct rt6_info *rt, int oif,
645 m = rt6_check_dev(rt, oif);
646 if (!m && (strict & RT6_LOOKUP_F_IFACE))
647 return RT6_NUD_FAIL_HARD;
648 #ifdef CONFIG_IPV6_ROUTER_PREF
649 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
651 if (strict & RT6_LOOKUP_F_REACHABLE) {
652 int n = rt6_check_neigh(rt);
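/* Illustrative sketch, not part of the build: rt6_score_route() packs its
 * criteria into one integer so a plain ">" comparison in find_match() orders
 * candidates.  Bits 0-1 hold the interface match (2 exact, 1 via a loopback
 * copy, 0 none) and, with CONFIG_IPV6_ROUTER_PREF, the decoded RA preference
 * (roughly 0..3, higher is better) sits above them, so preference outranks
 * the interface score.
 */
#if 0
static int score_sketch(int dev_match /* 0..2 */, int ra_pref /* 0..3 */)
{
	return dev_match | (ra_pref << 2);
}
#endif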
659 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
660 int *mpri, struct rt6_info *match,
664 bool match_do_rr = false;
665 struct inet6_dev *idev = rt->rt6i_idev;
667 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
670 if (idev->cnf.ignore_routes_with_linkdown &&
671 rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
672 !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
675 if (fib6_check_expired(rt))
678 m = rt6_score_route(rt, oif, strict);
679 if (m == RT6_NUD_FAIL_DO_RR) {
681 m = 0; /* lowest valid score */
682 } else if (m == RT6_NUD_FAIL_HARD) {
686 if (strict & RT6_LOOKUP_F_REACHABLE)
689 /* note that m can be RT6_NUD_FAIL_PROBE at this point */
691 *do_rr = match_do_rr;
699 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
700 struct rt6_info *leaf,
701 struct rt6_info *rr_head,
702 u32 metric, int oif, int strict,
705 struct rt6_info *rt, *match, *cont;
710 for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) {
711 if (rt->rt6i_metric != metric) {
716 match = find_match(rt, oif, strict, &mpri, match, do_rr);
719 for (rt = leaf; rt && rt != rr_head;
720 rt = rcu_dereference(rt->rt6_next)) {
721 if (rt->rt6i_metric != metric) {
726 match = find_match(rt, oif, strict, &mpri, match, do_rr);
732 for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next))
733 match = find_match(rt, oif, strict, &mpri, match, do_rr);
738 static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
741 struct rt6_info *leaf = rcu_dereference(fn->leaf);
742 struct rt6_info *match, *rt0;
746 if (!leaf || leaf == net->ipv6.fib6_null_entry)
747 return net->ipv6.fib6_null_entry;
749 rt0 = rcu_dereference(fn->rr_ptr);
/* Double check to make sure fn is not an intermediate node
 * and fn->leaf does not point to its child's leaf
 * (This might happen if all routes under fn are deleted from
 * the tree and fib6_repair_tree() is called on the node.)
 */
758 key_plen = rt0->rt6i_dst.plen;
759 #ifdef CONFIG_IPV6_SUBTREES
760 if (rt0->rt6i_src.plen)
761 key_plen = rt0->rt6i_src.plen;
763 if (fn->fn_bit != key_plen)
764 return net->ipv6.fib6_null_entry;
766 match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
770 struct rt6_info *next = rcu_dereference(rt0->rt6_next);
772 /* no entries matched; do round-robin */
773 if (!next || next->rt6i_metric != rt0->rt6i_metric)
777 spin_lock_bh(&leaf->rt6i_table->tb6_lock);
778 /* make sure next is not being deleted from the tree */
780 rcu_assign_pointer(fn->rr_ptr, next);
781 spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
785 return match ? match : net->ipv6.fib6_null_entry;
788 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
790 return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
793 #ifdef CONFIG_IPV6_ROUTE_INFO
794 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
795 const struct in6_addr *gwaddr)
797 struct net *net = dev_net(dev);
798 struct route_info *rinfo = (struct route_info *) opt;
799 struct in6_addr prefix_buf, *prefix;
801 unsigned long lifetime;
804 if (len < sizeof(struct route_info)) {
808 /* Sanity check for prefix_len and length */
809 if (rinfo->length > 3) {
811 } else if (rinfo->prefix_len > 128) {
813 } else if (rinfo->prefix_len > 64) {
814 if (rinfo->length < 2) {
817 } else if (rinfo->prefix_len > 0) {
818 if (rinfo->length < 1) {
823 pref = rinfo->route_pref;
824 if (pref == ICMPV6_ROUTER_PREF_INVALID)
827 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
829 if (rinfo->length == 3)
830 prefix = (struct in6_addr *)rinfo->prefix;
832 /* this function is safe */
833 ipv6_addr_prefix(&prefix_buf,
834 (struct in6_addr *)rinfo->prefix,
836 prefix = &prefix_buf;
839 if (rinfo->prefix_len == 0)
840 rt = rt6_get_dflt_router(net, gwaddr, dev);
842 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
845 if (rt && !lifetime) {
851 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
854 rt->rt6i_flags = RTF_ROUTEINFO |
855 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
858 if (!addrconf_finite_timeout(lifetime))
859 fib6_clean_expires(rt);
861 fib6_set_expires(rt, jiffies + HZ * lifetime);
863 fib6_info_release(rt);
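/* Illustrative sketch, not part of the build: the sanity checks at the top
 * of rt6_route_rcv() in predicate form.  "length" is the RFC 4191 option
 * length in 8-octet units (1 = header only, 2 = header plus 8 prefix octets,
 * 3 = header plus 16), and the prefix length may not exceed 128 bits.
 */
#if 0
#include <stdbool.h>
#include <stdint.h>

static bool route_info_sane(uint8_t length, uint8_t prefix_len)
{
	if (length > 3 || prefix_len > 128)
		return false;
	if (prefix_len > 64)
		return length >= 2;
	if (prefix_len > 0)
		return length >= 1;
	return true;		/* default-route information, no prefix bytes */
}
#endif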
870 * Misc support functions
873 /* called with rcu_lock held */
874 static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
876 struct net_device *dev = rt->fib6_nh.nh_dev;
878 if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) {
879 /* for copies of local routes, dst->dev needs to be the
880 * device if it is a master device, the master device if
881 * device is enslaved, and the loopback as the default
883 if (netif_is_l3_slave(dev) &&
884 !rt6_need_strict(&rt->rt6i_dst.addr))
885 dev = l3mdev_master_dev_rcu(dev);
886 else if (!netif_is_l3_master(dev))
887 dev = dev_net(dev)->loopback_dev;
888 /* last case is netif_is_l3_master(dev) is true in which
889 * case we want dev returned to be dev
896 static const int fib6_prop[RTN_MAX + 1] = {
903 [RTN_BLACKHOLE] = -EINVAL,
904 [RTN_UNREACHABLE] = -EHOSTUNREACH,
905 [RTN_PROHIBIT] = -EACCES,
906 [RTN_THROW] = -EAGAIN,
908 [RTN_XRESOLVE] = -EINVAL,
911 static int ip6_rt_type_to_error(u8 fib6_type)
913 return fib6_prop[fib6_type];
916 static unsigned short fib6_info_dst_flags(struct rt6_info *rt)
918 unsigned short flags = 0;
921 flags |= DST_NOCOUNT;
922 if (rt->dst_nopolicy)
923 flags |= DST_NOPOLICY;
930 static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct rt6_info *ort)
932 rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);
934 switch (ort->fib6_type) {
936 rt->dst.output = dst_discard_out;
937 rt->dst.input = dst_discard;
940 rt->dst.output = ip6_pkt_prohibit_out;
941 rt->dst.input = ip6_pkt_prohibit;
944 case RTN_UNREACHABLE:
946 rt->dst.output = ip6_pkt_discard_out;
947 rt->dst.input = ip6_pkt_discard;
952 static void ip6_rt_init_dst(struct rt6_info *rt, struct rt6_info *ort)
954 rt->dst.flags |= fib6_info_dst_flags(ort);
956 if (ort->rt6i_flags & RTF_REJECT) {
957 ip6_rt_init_dst_reject(rt, ort);
962 rt->dst.output = ip6_output;
964 if (ort->fib6_type == RTN_LOCAL) {
965 rt->dst.input = ip6_input;
966 } else if (ipv6_addr_type(&ort->rt6i_dst.addr) & IPV6_ADDR_MULTICAST) {
967 rt->dst.input = ip6_mc_input;
969 rt->dst.input = ip6_forward;
972 if (ort->fib6_nh.nh_lwtstate) {
973 rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
974 lwtunnel_set_redirect(&rt->dst);
977 rt->dst.lastuse = jiffies;
980 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
982 rt->rt6i_flags &= ~RTF_EXPIRES;
983 fib6_info_hold(from);
985 dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
986 if (from->fib6_metrics != &dst_default_metrics) {
987 rt->dst._metrics |= DST_METRICS_REFCOUNTED;
988 refcount_inc(&from->fib6_metrics->refcnt);
992 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
994 ip6_rt_init_dst(rt, ort);
996 rt->rt6i_dst = ort->rt6i_dst;
997 rt->rt6i_idev = ort->rt6i_idev;
999 in6_dev_hold(rt->rt6i_idev);
1000 rt->rt6i_gateway = ort->fib6_nh.nh_gw;
1001 rt->rt6i_flags = ort->rt6i_flags;
1002 rt6_set_from(rt, ort);
1003 rt->rt6i_metric = ort->rt6i_metric;
1004 #ifdef CONFIG_IPV6_SUBTREES
1005 rt->rt6i_src = ort->rt6i_src;
1007 rt->rt6i_prefsrc = ort->rt6i_prefsrc;
1008 rt->rt6i_table = ort->rt6i_table;
1009 rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
1012 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
1013 struct in6_addr *saddr)
1015 struct fib6_node *pn, *sn;
1017 if (fn->fn_flags & RTN_TL_ROOT)
1019 pn = rcu_dereference(fn->parent);
1020 sn = FIB6_SUBTREE(pn);
1022 fn = fib6_lookup(sn, NULL, saddr);
1025 if (fn->fn_flags & RTN_RTINFO)
1030 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
1033 struct rt6_info *rt = *prt;
1035 if (dst_hold_safe(&rt->dst))
1037 if (null_fallback) {
1038 rt = net->ipv6.ip6_null_entry;
1047 /* called with rcu_lock held */
1048 static struct rt6_info *ip6_create_rt_rcu(struct rt6_info *rt)
1050 unsigned short flags = fib6_info_dst_flags(rt);
1051 struct net_device *dev = rt->fib6_nh.nh_dev;
1052 struct rt6_info *nrt;
1054 nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1056 ip6_rt_copy_init(nrt, rt);
1061 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
1062 struct fib6_table *table,
1064 const struct sk_buff *skb,
1067 struct rt6_info *f6i;
1068 struct fib6_node *fn;
1069 struct rt6_info *rt;
1071 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1072 flags &= ~RT6_LOOKUP_F_IFACE;
1075 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1077 f6i = rcu_dereference(fn->leaf);
1079 f6i = net->ipv6.fib6_null_entry;
1081 f6i = rt6_device_match(net, f6i, &fl6->saddr,
1082 fl6->flowi6_oif, flags);
1083 if (f6i->rt6i_nsiblings && fl6->flowi6_oif == 0)
1084 f6i = rt6_multipath_select(net, f6i, fl6,
1085 fl6->flowi6_oif, skb, flags);
1087 if (f6i == net->ipv6.fib6_null_entry) {
1088 fn = fib6_backtrack(fn, &fl6->saddr);
1093 /* Search through exception table */
1094 rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1096 if (ip6_hold_safe(net, &rt, true))
1097 dst_use_noref(&rt->dst, jiffies);
1098 } else if (f6i == net->ipv6.fib6_null_entry) {
1099 rt = net->ipv6.ip6_null_entry;
1102 rt = ip6_create_rt_rcu(f6i);
1104 rt = net->ipv6.ip6_null_entry;
1111 trace_fib6_table_lookup(net, rt, table, fl6);
1116 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
1117 const struct sk_buff *skb, int flags)
1119 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
1121 EXPORT_SYMBOL_GPL(ip6_route_lookup);
1123 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1124 const struct in6_addr *saddr, int oif,
1125 const struct sk_buff *skb, int strict)
1127 struct flowi6 fl6 = {
1131 struct dst_entry *dst;
1132 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1135 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1136 flags |= RT6_LOOKUP_F_HAS_SADDR;
1139 dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1140 if (dst->error == 0)
1141 return (struct rt6_info *) dst;
1147 EXPORT_SYMBOL(rt6_lookup);
/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason,
 * the route is released.
 * Caller must hold dst before calling it.
 */
1155 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
1156 struct netlink_ext_ack *extack)
1159 struct fib6_table *table;
1161 table = rt->rt6i_table;
1162 spin_lock_bh(&table->tb6_lock);
1163 err = fib6_add(&table->tb6_root, rt, info, extack);
1164 spin_unlock_bh(&table->tb6_lock);
1169 int ip6_ins_rt(struct net *net, struct rt6_info *rt)
1171 struct nl_info info = { .nl_net = net, };
1173 return __ip6_ins_rt(rt, &info, NULL);
1176 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
1177 const struct in6_addr *daddr,
1178 const struct in6_addr *saddr)
1180 struct net_device *dev;
1181 struct rt6_info *rt;
1188 dev = ip6_rt_get_dev_rcu(ort);
1189 rt = ip6_dst_alloc(dev_net(dev), dev, 0);
1194 ip6_rt_copy_init(rt, ort);
1195 rt->rt6i_flags |= RTF_CACHE;
1196 rt->rt6i_metric = 0;
1197 rt->dst.flags |= DST_HOST;
1198 rt->rt6i_dst.addr = *daddr;
1199 rt->rt6i_dst.plen = 128;
1201 if (!rt6_is_gw_or_nonexthop(ort)) {
1202 if (ort->rt6i_dst.plen != 128 &&
1203 ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
1204 rt->rt6i_flags |= RTF_ANYCAST;
1205 #ifdef CONFIG_IPV6_SUBTREES
1206 if (rt->rt6i_src.plen && saddr) {
1207 rt->rt6i_src.addr = *saddr;
1208 rt->rt6i_src.plen = 128;
1216 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
1218 unsigned short flags = fib6_info_dst_flags(rt);
1219 struct net_device *dev;
1220 struct rt6_info *pcpu_rt;
1223 dev = ip6_rt_get_dev_rcu(rt);
1224 pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
1228 ip6_rt_copy_init(pcpu_rt, rt);
1229 pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1230 pcpu_rt->rt6i_flags |= RTF_PCPU;
1234 /* It should be called with rcu_read_lock() acquired */
1235 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1237 struct rt6_info *pcpu_rt, **p;
1239 p = this_cpu_ptr(rt->rt6i_pcpu);
1243 ip6_hold_safe(NULL, &pcpu_rt, false);
1248 static struct rt6_info *rt6_make_pcpu_route(struct net *net,
1249 struct rt6_info *rt)
1251 struct rt6_info *pcpu_rt, *prev, **p;
1253 pcpu_rt = ip6_rt_pcpu_alloc(rt);
1255 dst_hold(&net->ipv6.ip6_null_entry->dst);
1256 return net->ipv6.ip6_null_entry;
1259 dst_hold(&pcpu_rt->dst);
1260 p = this_cpu_ptr(rt->rt6i_pcpu);
1261 prev = cmpxchg(p, NULL, pcpu_rt);
/* exception hash table implementation
 */
1269 static DEFINE_SPINLOCK(rt6_exception_lock);
1271 /* Remove rt6_ex from hash table and free the memory
1272 * Caller must hold rt6_exception_lock
1274 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1275 struct rt6_exception *rt6_ex)
1279 if (!bucket || !rt6_ex)
1282 net = dev_net(rt6_ex->rt6i->dst.dev);
1283 rt6_ex->rt6i->rt6i_node = NULL;
1284 hlist_del_rcu(&rt6_ex->hlist);
1285 ip6_rt_put(rt6_ex->rt6i);
1286 kfree_rcu(rt6_ex, rcu);
1287 WARN_ON_ONCE(!bucket->depth);
1289 net->ipv6.rt6_stats->fib_rt_cache--;
1292 /* Remove oldest rt6_ex in bucket and free the memory
1293 * Caller must hold rt6_exception_lock
1295 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1297 struct rt6_exception *rt6_ex, *oldest = NULL;
1302 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1303 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1306 rt6_remove_exception(bucket, oldest);
1309 static u32 rt6_exception_hash(const struct in6_addr *dst,
1310 const struct in6_addr *src)
1312 static u32 seed __read_mostly;
1315 net_get_random_once(&seed, sizeof(seed));
1316 val = jhash(dst, sizeof(*dst), seed);
1318 #ifdef CONFIG_IPV6_SUBTREES
1320 val = jhash(src, sizeof(*src), val);
1322 return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
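/* Illustrative sketch, not part of the build: the exception-table hash above
 * in standalone form.  FNV-1a stands in for jhash(), a multiplicative fold
 * stands in for hash_32(), and BUCKET_SHIFT is a stand-in for
 * FIB6_EXCEPTION_BUCKET_SIZE_SHIFT.
 */
#if 0
#include <stddef.h>
#include <stdint.h>

#define BUCKET_SHIFT 10

static uint32_t fnv1a(const void *data, size_t len, uint32_t seed)
{
	const unsigned char *p = data;
	uint32_t h = seed ^ 2166136261u;

	while (len--) {
		h ^= *p++;
		h *= 16777619u;
	}
	return h;
}

/* hash the destination, fold in the source for subtree routes, then reduce
 * to a bucket index in [0, 2^BUCKET_SHIFT)
 */
static uint32_t exception_bucket(const uint8_t dst[16], const uint8_t *src,
				 uint32_t seed)
{
	uint32_t val = fnv1a(dst, 16, seed);

	if (src)
		val = fnv1a(src, 16, val);
	return (val * 0x61C88647u) >> (32 - BUCKET_SHIFT);
}
#endif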
1325 /* Helper function to find the cached rt in the hash table
1326 * and update bucket pointer to point to the bucket for this
1327 * (daddr, saddr) pair
1328 * Caller must hold rt6_exception_lock
1330 static struct rt6_exception *
1331 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1332 const struct in6_addr *daddr,
1333 const struct in6_addr *saddr)
1335 struct rt6_exception *rt6_ex;
1338 if (!(*bucket) || !daddr)
1341 hval = rt6_exception_hash(daddr, saddr);
1344 hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1345 struct rt6_info *rt6 = rt6_ex->rt6i;
1346 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1348 #ifdef CONFIG_IPV6_SUBTREES
1349 if (matched && saddr)
1350 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1358 /* Helper function to find the cached rt in the hash table
1359 * and update bucket pointer to point to the bucket for this
1360 * (daddr, saddr) pair
1361 * Caller must hold rcu_read_lock()
1363 static struct rt6_exception *
1364 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1365 const struct in6_addr *daddr,
1366 const struct in6_addr *saddr)
1368 struct rt6_exception *rt6_ex;
1371 WARN_ON_ONCE(!rcu_read_lock_held());
1373 if (!(*bucket) || !daddr)
1376 hval = rt6_exception_hash(daddr, saddr);
1379 hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1380 struct rt6_info *rt6 = rt6_ex->rt6i;
1381 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1383 #ifdef CONFIG_IPV6_SUBTREES
1384 if (matched && saddr)
1385 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1393 static unsigned int fib6_mtu(const struct rt6_info *rt)
1397 mtu = rt->fib6_pmtu ? : rt->rt6i_idev->cnf.mtu6;
1398 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1400 return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
1403 static int rt6_insert_exception(struct rt6_info *nrt,
1404 struct rt6_info *ort)
1406 struct net *net = dev_net(nrt->dst.dev);
1407 struct rt6_exception_bucket *bucket;
1408 struct in6_addr *src_key = NULL;
1409 struct rt6_exception *rt6_ex;
1412 spin_lock_bh(&rt6_exception_lock);
1414 if (ort->exception_bucket_flushed) {
1419 bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1420 lockdep_is_held(&rt6_exception_lock));
1422 bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1428 rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1431 #ifdef CONFIG_IPV6_SUBTREES
1432 /* rt6i_src.plen != 0 indicates ort is in subtree
1433 * and exception table is indexed by a hash of
1434 * both rt6i_dst and rt6i_src.
1435 * Otherwise, the exception table is indexed by
1436 * a hash of only rt6i_dst.
1438 if (ort->rt6i_src.plen)
1439 src_key = &nrt->rt6i_src.addr;
1442 /* Update rt6i_prefsrc as it could be changed
1443 * in rt6_remove_prefsrc()
1445 nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
1446 /* rt6_mtu_change() might lower mtu on ort.
1447 * Only insert this exception route if its mtu
1448 * is less than ort's mtu value.
1450 if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
1455 rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1458 rt6_remove_exception(bucket, rt6_ex);
1460 rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1466 rt6_ex->stamp = jiffies;
1467 atomic_inc(&nrt->rt6i_ref);
1468 nrt->rt6i_node = ort->rt6i_node;
1469 hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1471 net->ipv6.rt6_stats->fib_rt_cache++;
1473 if (bucket->depth > FIB6_MAX_DEPTH)
1474 rt6_exception_remove_oldest(bucket);
1477 spin_unlock_bh(&rt6_exception_lock);
1479 /* Update fn->fn_sernum to invalidate all cached dst */
1481 spin_lock_bh(&ort->rt6i_table->tb6_lock);
1482 fib6_update_sernum(net, ort);
1483 spin_unlock_bh(&ort->rt6i_table->tb6_lock);
1484 fib6_force_start_gc(net);
1490 void rt6_flush_exceptions(struct rt6_info *rt)
1492 struct rt6_exception_bucket *bucket;
1493 struct rt6_exception *rt6_ex;
1494 struct hlist_node *tmp;
1497 spin_lock_bh(&rt6_exception_lock);
/* Prevent rt6_insert_exception() from recreating the bucket list */
1499 rt->exception_bucket_flushed = 1;
1501 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1502 lockdep_is_held(&rt6_exception_lock));
1506 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1507 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1508 rt6_remove_exception(bucket, rt6_ex);
1509 WARN_ON_ONCE(bucket->depth);
1514 spin_unlock_bh(&rt6_exception_lock);
1517 /* Find cached rt in the hash table inside passed in rt
1518 * Caller has to hold rcu_read_lock()
1520 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
1521 struct in6_addr *daddr,
1522 struct in6_addr *saddr)
1524 struct rt6_exception_bucket *bucket;
1525 struct in6_addr *src_key = NULL;
1526 struct rt6_exception *rt6_ex;
1527 struct rt6_info *res = NULL;
1529 bucket = rcu_dereference(rt->rt6i_exception_bucket);
1531 #ifdef CONFIG_IPV6_SUBTREES
1532 /* rt6i_src.plen != 0 indicates rt is in subtree
1533 * and exception table is indexed by a hash of
1534 * both rt6i_dst and rt6i_src.
1535 * Otherwise, the exception table is indexed by
1536 * a hash of only rt6i_dst.
1538 if (rt->rt6i_src.plen)
1541 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1543 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1549 /* Remove the passed in cached rt from the hash table that contains it */
1550 static int rt6_remove_exception_rt(struct rt6_info *rt)
1552 struct rt6_exception_bucket *bucket;
1553 struct rt6_info *from = rt->from;
1554 struct in6_addr *src_key = NULL;
1555 struct rt6_exception *rt6_ex;
1559 !(rt->rt6i_flags & RTF_CACHE))
1562 if (!rcu_access_pointer(from->rt6i_exception_bucket))
1565 spin_lock_bh(&rt6_exception_lock);
1566 bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1567 lockdep_is_held(&rt6_exception_lock));
1568 #ifdef CONFIG_IPV6_SUBTREES
1569 /* rt6i_src.plen != 0 indicates 'from' is in subtree
1570 * and exception table is indexed by a hash of
1571 * both rt6i_dst and rt6i_src.
1572 * Otherwise, the exception table is indexed by
1573 * a hash of only rt6i_dst.
1575 if (from->rt6i_src.plen)
1576 src_key = &rt->rt6i_src.addr;
1578 rt6_ex = __rt6_find_exception_spinlock(&bucket,
1582 rt6_remove_exception(bucket, rt6_ex);
1588 spin_unlock_bh(&rt6_exception_lock);
/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
1595 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1597 struct rt6_exception_bucket *bucket;
1598 struct rt6_info *from = rt->from;
1599 struct in6_addr *src_key = NULL;
1600 struct rt6_exception *rt6_ex;
1603 !(rt->rt6i_flags & RTF_CACHE))
1607 bucket = rcu_dereference(from->rt6i_exception_bucket);
1609 #ifdef CONFIG_IPV6_SUBTREES
1610 /* rt6i_src.plen != 0 indicates 'from' is in subtree
1611 * and exception table is indexed by a hash of
1612 * both rt6i_dst and rt6i_src.
1613 * Otherwise, the exception table is indexed by
1614 * a hash of only rt6i_dst.
1616 if (from->rt6i_src.plen)
1617 src_key = &rt->rt6i_src.addr;
1619 rt6_ex = __rt6_find_exception_rcu(&bucket,
1623 rt6_ex->stamp = jiffies;
1628 static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
1630 struct rt6_exception_bucket *bucket;
1631 struct rt6_exception *rt6_ex;
1634 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1635 lockdep_is_held(&rt6_exception_lock));
1638 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1639 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1640 rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
1647 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1648 struct rt6_info *rt, int mtu)
1650 /* If the new MTU is lower than the route PMTU, this new MTU will be the
1651 * lowest MTU in the path: always allow updating the route PMTU to
1652 * reflect PMTU decreases.
1654 * If the new MTU is higher, and the route PMTU is equal to the local
1655 * MTU, this means the old MTU is the lowest in the path, so allow
1656 * updating it: if other nodes now have lower MTUs, PMTU discovery will
1660 if (dst_mtu(&rt->dst) >= mtu)
1663 if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
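/* Illustrative sketch, not part of the build: rt6_mtu_change_route_allowed()
 * as a pure predicate.  A device MTU change may be copied into a cached
 * route's PMTU only if it is not an increase, or if the current PMTU equals
 * the link MTU (i.e. this link, not a later hop, was the bottleneck).
 */
#if 0
#include <stdbool.h>

static bool pmtu_update_allowed(unsigned int route_pmtu,
				unsigned int link_mtu,
				unsigned int new_mtu)
{
	if (route_pmtu >= new_mtu)	/* decreases always propagate */
		return true;
	return route_pmtu == link_mtu;	/* increase only if we were the limit */
}
#endif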
1669 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
1670 struct rt6_info *rt, int mtu)
1672 struct rt6_exception_bucket *bucket;
1673 struct rt6_exception *rt6_ex;
1676 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1677 lockdep_is_held(&rt6_exception_lock));
1682 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1683 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1684 struct rt6_info *entry = rt6_ex->rt6i;
/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
 * route), the metrics of its rt->from have already
 * been updated.
 */
1690 if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
1691 rt6_mtu_change_route_allowed(idev, entry, mtu))
1692 dst_metric_set(&entry->dst, RTAX_MTU, mtu);
1698 #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE)
1700 static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
1701 struct in6_addr *gateway)
1703 struct rt6_exception_bucket *bucket;
1704 struct rt6_exception *rt6_ex;
1705 struct hlist_node *tmp;
1708 if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1711 spin_lock_bh(&rt6_exception_lock);
1712 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1713 lockdep_is_held(&rt6_exception_lock));
1716 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1717 hlist_for_each_entry_safe(rt6_ex, tmp,
1718 &bucket->chain, hlist) {
1719 struct rt6_info *entry = rt6_ex->rt6i;
1721 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1722 RTF_CACHE_GATEWAY &&
1723 ipv6_addr_equal(gateway,
1724 &entry->rt6i_gateway)) {
1725 rt6_remove_exception(bucket, rt6_ex);
1732 spin_unlock_bh(&rt6_exception_lock);
1735 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1736 struct rt6_exception *rt6_ex,
1737 struct fib6_gc_args *gc_args,
1740 struct rt6_info *rt = rt6_ex->rt6i;
/* we are pruning and obsoleting aged-out and non-gateway exceptions
 * even if others still have references to them, so that on the next
 * dst_check() such references can be dropped.
 * EXPIRES exceptions - e.g. pmtu-generated ones - are pruned when
 * expired, independently of their aging, as per RFC 8201 section 4
 */
1748 if (!(rt->rt6i_flags & RTF_EXPIRES)) {
1749 if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1750 RT6_TRACE("aging clone %p\n", rt);
1751 rt6_remove_exception(bucket, rt6_ex);
1754 } else if (time_after(jiffies, rt->dst.expires)) {
1755 RT6_TRACE("purging expired route %p\n", rt);
1756 rt6_remove_exception(bucket, rt6_ex);
1760 if (rt->rt6i_flags & RTF_GATEWAY) {
1761 struct neighbour *neigh;
1762 __u8 neigh_flags = 0;
1764 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
1766 neigh_flags = neigh->flags;
1768 if (!(neigh_flags & NTF_ROUTER)) {
1769 RT6_TRACE("purging route %p via non-router but gateway\n",
1771 rt6_remove_exception(bucket, rt6_ex);
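/* Illustrative sketch, not part of the build: the pruning decision made by
 * rt6_age_examine_exception() condensed into one predicate.  EXPIRES (e.g.
 * pmtu-learned) exceptions go when their expiry passes, other clones go when
 * idle past the gc timeout, and gateway clones also go once the neighbour
 * entry no longer marks the gateway as a router.
 */
#if 0
#include <stdbool.h>

struct exc_state {
	bool has_expires, expired;	/* RTF_EXPIRES set / expiry passed     */
	bool idle_too_long;		/* now - lastuse >= gc timeout         */
	bool via_gateway, gw_is_router;	/* RTF_GATEWAY / neighbour NTF_ROUTER  */
};

static bool exception_should_go(const struct exc_state *e)
{
	if (e->has_expires ? e->expired : e->idle_too_long)
		return true;
	return e->via_gateway && !e->gw_is_router;
}
#endif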
1779 void rt6_age_exceptions(struct rt6_info *rt,
1780 struct fib6_gc_args *gc_args,
1783 struct rt6_exception_bucket *bucket;
1784 struct rt6_exception *rt6_ex;
1785 struct hlist_node *tmp;
1788 if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1792 spin_lock(&rt6_exception_lock);
1793 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1794 lockdep_is_held(&rt6_exception_lock));
1797 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1798 hlist_for_each_entry_safe(rt6_ex, tmp,
1799 &bucket->chain, hlist) {
1800 rt6_age_examine_exception(bucket, rt6_ex,
1806 spin_unlock(&rt6_exception_lock);
1807 rcu_read_unlock_bh();
1810 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1811 int oif, struct flowi6 *fl6,
1812 const struct sk_buff *skb, int flags)
1814 struct fib6_node *fn, *saved_fn;
1815 struct rt6_info *f6i;
1816 struct rt6_info *rt;
1819 strict |= flags & RT6_LOOKUP_F_IFACE;
1820 strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1821 if (net->ipv6.devconf_all->forwarding == 0)
1822 strict |= RT6_LOOKUP_F_REACHABLE;
1826 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1829 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1833 f6i = rt6_select(net, fn, oif, strict);
1834 if (f6i->rt6i_nsiblings)
1835 f6i = rt6_multipath_select(net, f6i, fl6, oif, skb, strict);
1836 if (f6i == net->ipv6.fib6_null_entry) {
1837 fn = fib6_backtrack(fn, &fl6->saddr);
1839 goto redo_rt6_select;
1840 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1841 /* also consider unreachable route */
1842 strict &= ~RT6_LOOKUP_F_REACHABLE;
1844 goto redo_rt6_select;
1848 if (f6i == net->ipv6.fib6_null_entry) {
1849 rt = net->ipv6.ip6_null_entry;
1852 trace_fib6_table_lookup(net, rt, table, fl6);
/* Search through exception table */
1857 rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1859 if (ip6_hold_safe(net, &rt, true))
1860 dst_use_noref(&rt->dst, jiffies);
1863 trace_fib6_table_lookup(net, rt, table, fl6);
1865 } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1866 !(f6i->rt6i_flags & RTF_GATEWAY))) {
1867 /* Create a RTF_CACHE clone which will not be
1868 * owned by the fib6 tree. It is for the special case where
1869 * the daddr in the skb during the neighbor look-up is different
1870 * from the fl6->daddr used to look-up route here.
1873 struct rt6_info *uncached_rt;
1875 fib6_info_hold(f6i);
1878 uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);
1879 fib6_info_release(f6i);
1882 /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1883 * No need for another dst_hold()
1885 rt6_uncached_list_add(uncached_rt);
1886 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1888 uncached_rt = net->ipv6.ip6_null_entry;
1889 dst_hold(&uncached_rt->dst);
1892 trace_fib6_table_lookup(net, uncached_rt, table, fl6);
1896 /* Get a percpu copy */
1898 struct rt6_info *pcpu_rt;
1901 pcpu_rt = rt6_get_pcpu_route(f6i);
1904 pcpu_rt = rt6_make_pcpu_route(net, f6i);
1908 trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
1912 EXPORT_SYMBOL_GPL(ip6_pol_route);
1914 static struct rt6_info *ip6_pol_route_input(struct net *net,
1915 struct fib6_table *table,
1917 const struct sk_buff *skb,
1920 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1923 struct dst_entry *ip6_route_input_lookup(struct net *net,
1924 struct net_device *dev,
1926 const struct sk_buff *skb,
1929 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1930 flags |= RT6_LOOKUP_F_IFACE;
1932 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1934 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1936 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1937 struct flow_keys *keys,
1938 struct flow_keys *flkeys)
1940 const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1941 const struct ipv6hdr *key_iph = outer_iph;
1942 struct flow_keys *_flkeys = flkeys;
1943 const struct ipv6hdr *inner_iph;
1944 const struct icmp6hdr *icmph;
1945 struct ipv6hdr _inner_iph;
1947 if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1950 icmph = icmp6_hdr(skb);
1951 if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1952 icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1953 icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1954 icmph->icmp6_type != ICMPV6_PARAMPROB)
1957 inner_iph = skb_header_pointer(skb,
1958 skb_transport_offset(skb) + sizeof(*icmph),
1959 sizeof(_inner_iph), &_inner_iph);
1963 key_iph = inner_iph;
1967 keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
1968 keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
1969 keys->tags.flow_label = _flkeys->tags.flow_label;
1970 keys->basic.ip_proto = _flkeys->basic.ip_proto;
1972 keys->addrs.v6addrs.src = key_iph->saddr;
1973 keys->addrs.v6addrs.dst = key_iph->daddr;
1974 keys->tags.flow_label = ip6_flowinfo(key_iph);
1975 keys->basic.ip_proto = key_iph->nexthdr;
1979 /* if skb is set it will be used and fl6 can be NULL */
1980 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
1981 const struct sk_buff *skb, struct flow_keys *flkeys)
1983 struct flow_keys hash_keys;
1986 switch (ip6_multipath_hash_policy(net)) {
1988 memset(&hash_keys, 0, sizeof(hash_keys));
1989 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1991 ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
1993 hash_keys.addrs.v6addrs.src = fl6->saddr;
1994 hash_keys.addrs.v6addrs.dst = fl6->daddr;
1995 hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
1996 hash_keys.basic.ip_proto = fl6->flowi6_proto;
2001 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
2002 struct flow_keys keys;
2004 /* short-circuit if we already have L4 hash present */
2006 return skb_get_hash_raw(skb) >> 1;
2008 memset(&hash_keys, 0, sizeof(hash_keys));
2011 skb_flow_dissect_flow_keys(skb, &keys, flag);
2014 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2015 hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2016 hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2017 hash_keys.ports.src = flkeys->ports.src;
2018 hash_keys.ports.dst = flkeys->ports.dst;
2019 hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2021 memset(&hash_keys, 0, sizeof(hash_keys));
2022 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2023 hash_keys.addrs.v6addrs.src = fl6->saddr;
2024 hash_keys.addrs.v6addrs.dst = fl6->daddr;
2025 hash_keys.ports.src = fl6->fl6_sport;
2026 hash_keys.ports.dst = fl6->fl6_dport;
2027 hash_keys.basic.ip_proto = fl6->flowi6_proto;
2031 mhash = flow_hash_from_keys(&hash_keys);
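/* Illustrative sketch, not part of the build: the two hash policies handled
 * above differ only in which fields feed the flow hash - policy 0 uses the
 * L3 tuple (addresses, flow label, next header), policy 1 swaps the flow
 * label for the transport ports.  mix32() is a stand-in for
 * flow_hash_from_keys(); the final shift mirrors the ">> 1" above.
 */
#if 0
#include <stdint.h>
#include <string.h>

static uint32_t mix32(uint32_t h, uint32_t v)
{
	h ^= v;
	h *= 0x9e3779b1u;
	return h;
}

struct v6_flow {
	uint8_t  saddr[16], daddr[16];
	uint32_t flowlabel;
	uint8_t  nexthdr;
	uint16_t sport, dport;
};

static uint32_t mp_hash_sketch(const struct v6_flow *f, int l4_policy)
{
	uint32_t h = 0x811c9dc5u, w;

	for (int i = 0; i < 16; i += 4) {
		memcpy(&w, f->saddr + i, 4); h = mix32(h, w);
		memcpy(&w, f->daddr + i, 4); h = mix32(h, w);
	}
	if (l4_policy)
		h = mix32(h, ((uint32_t)f->sport << 16) | f->dport);
	else
		h = mix32(h, f->flowlabel);
	h = mix32(h, f->nexthdr);

	return h >> 1;
}
#endif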
2036 void ip6_route_input(struct sk_buff *skb)
2038 const struct ipv6hdr *iph = ipv6_hdr(skb);
2039 struct net *net = dev_net(skb->dev);
2040 int flags = RT6_LOOKUP_F_HAS_SADDR;
2041 struct ip_tunnel_info *tun_info;
2042 struct flowi6 fl6 = {
2043 .flowi6_iif = skb->dev->ifindex,
2044 .daddr = iph->daddr,
2045 .saddr = iph->saddr,
2046 .flowlabel = ip6_flowinfo(iph),
2047 .flowi6_mark = skb->mark,
2048 .flowi6_proto = iph->nexthdr,
2050 struct flow_keys *flkeys = NULL, _flkeys;
2052 tun_info = skb_tunnel_info(skb);
2053 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2054 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
2056 if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
2059 if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
2060 fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
2063 ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
2066 static struct rt6_info *ip6_pol_route_output(struct net *net,
2067 struct fib6_table *table,
2069 const struct sk_buff *skb,
2072 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2075 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
2076 struct flowi6 *fl6, int flags)
2080 if (rt6_need_strict(&fl6->daddr)) {
2081 struct dst_entry *dst;
2083 dst = l3mdev_link_scope_lookup(net, fl6);
2088 fl6->flowi6_iif = LOOPBACK_IFINDEX;
2090 any_src = ipv6_addr_any(&fl6->saddr);
2091 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2092 (fl6->flowi6_oif && any_src))
2093 flags |= RT6_LOOKUP_F_IFACE;
2096 flags |= RT6_LOOKUP_F_HAS_SADDR;
2098 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2100 return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2102 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2104 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2106 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
2107 struct net_device *loopback_dev = net->loopback_dev;
2108 struct dst_entry *new = NULL;
2110 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2111 DST_OBSOLETE_DEAD, 0);
2114 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2118 new->input = dst_discard;
2119 new->output = dst_discard_out;
2121 dst_copy_metrics(new, &ort->dst);
2123 rt->rt6i_idev = in6_dev_get(loopback_dev);
2124 rt->rt6i_gateway = ort->rt6i_gateway;
2125 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2126 rt->rt6i_metric = 0;
2128 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2129 #ifdef CONFIG_IPV6_SUBTREES
2130 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2134 dst_release(dst_orig);
2135 return new ? new : ERR_PTR(-ENOMEM);
2139 * Destination cache support functions
2142 static bool fib6_check(struct rt6_info *f6i, u32 cookie)
2146 if ((f6i && !rt6_get_cookie_safe(f6i, &rt_cookie)) ||
2147 rt_cookie != cookie)
2150 if (fib6_check_expired(f6i))
2156 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
2160 if ((rt->from && !rt6_get_cookie_safe(rt->from, &rt_cookie)) ||
2161 rt_cookie != cookie)
2164 if (rt6_check_expired(rt))
2170 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
2172 if (!__rt6_check_expired(rt) &&
2173 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2174 fib6_check(rt->from, cookie))
2180 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2182 struct rt6_info *rt;
2184 rt = (struct rt6_info *) dst;
2186 /* All IPV6 dsts are created with ->obsolete set to the value
2187 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2188 * into this function always.
2191 if (rt->rt6i_flags & RTF_PCPU ||
2192 (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from))
2193 return rt6_dst_from_check(rt, cookie);
2195 return rt6_check(rt, cookie);
2198 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2200 struct rt6_info *rt = (struct rt6_info *) dst;
2203 if (rt->rt6i_flags & RTF_CACHE) {
2204 if (rt6_check_expired(rt)) {
2205 rt6_remove_exception_rt(rt);
2216 static void ip6_link_failure(struct sk_buff *skb)
2218 struct rt6_info *rt;
2220 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2222 rt = (struct rt6_info *) skb_dst(skb);
2224 if (rt->rt6i_flags & RTF_CACHE) {
2225 if (dst_hold_safe(&rt->dst))
2226 rt6_remove_exception_rt(rt);
2227 } else if (rt->from) {
2228 struct fib6_node *fn;
2231 fn = rcu_dereference(rt->from->rt6i_node);
2232 if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2239 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2241 struct net *net = dev_net(rt->dst.dev);
2243 dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2244 rt->rt6i_flags |= RTF_MODIFIED;
2245 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2248 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2250 return !(rt->rt6i_flags & RTF_CACHE) &&
2251 (rt->rt6i_flags & RTF_PCPU ||
2252 rcu_access_pointer(rt->rt6i_node));
2255 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2256 const struct ipv6hdr *iph, u32 mtu)
2258 const struct in6_addr *daddr, *saddr;
2259 struct rt6_info *rt6 = (struct rt6_info *)dst;
2261 if (rt6->rt6i_flags & RTF_LOCAL)
2264 if (dst_metric_locked(dst, RTAX_MTU))
2268 daddr = &iph->daddr;
2269 saddr = &iph->saddr;
2271 daddr = &sk->sk_v6_daddr;
2272 saddr = &inet6_sk(sk)->saddr;
2277 dst_confirm_neigh(dst, daddr);
2278 mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2279 if (mtu >= dst_mtu(dst))
2282 if (!rt6_cache_allowed_for_pmtu(rt6)) {
2283 rt6_do_update_pmtu(rt6, mtu);
2284 /* update rt6_ex->stamp for cache */
2285 if (rt6->rt6i_flags & RTF_CACHE)
2286 rt6_update_exception_stamp_rt(rt6);
2288 struct rt6_info *nrt6;
2290 nrt6 = ip6_rt_cache_alloc(rt6->from, daddr, saddr);
2292 rt6_do_update_pmtu(nrt6, mtu);
2293 if (rt6_insert_exception(nrt6, rt6->from))
2294 dst_release_immediate(&nrt6->dst);
2299 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2300 struct sk_buff *skb, u32 mtu)
2302 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2305 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2306 int oif, u32 mark, kuid_t uid)
2308 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2309 struct dst_entry *dst;
2312 memset(&fl6, 0, sizeof(fl6));
2313 fl6.flowi6_oif = oif;
2314 fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2315 fl6.daddr = iph->daddr;
2316 fl6.saddr = iph->saddr;
2317 fl6.flowlabel = ip6_flowinfo(iph);
2318 fl6.flowi6_uid = uid;
2320 dst = ip6_route_output(net, NULL, &fl6);
2322 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2325 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2327 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2329 struct dst_entry *dst;
2331 ip6_update_pmtu(skb, sock_net(sk), mtu,
2332 sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
2334 dst = __sk_dst_get(sk);
2335 if (!dst || !dst->obsolete ||
2336 dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2340 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2341 ip6_datagram_dst_update(sk, false);
2344 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2346 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2347 const struct flowi6 *fl6)
2349 #ifdef CONFIG_IPV6_SUBTREES
2350 struct ipv6_pinfo *np = inet6_sk(sk);
2353 ip6_dst_store(sk, dst,
2354 ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2355 &sk->sk_v6_daddr : NULL,
2356 #ifdef CONFIG_IPV6_SUBTREES
2357 ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2363 /* Handle redirects */
2364 struct ip6rd_flowi {
2366 struct in6_addr gateway;
2369 static struct rt6_info *__ip6_route_redirect(struct net *net,
2370 struct fib6_table *table,
2372 const struct sk_buff *skb,
2375 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2376 struct rt6_info *ret = NULL, *rt_cache;
2377 struct rt6_info *rt;
2378 struct fib6_node *fn;
/* Get the "current" route for this destination and
 * check if the redirect has come from an appropriate router.
 *
 * RFC 4861 specifies that redirects should only be
 * accepted if they come from the nexthop to the target.
 * Due to the way the routes are chosen, this notion
 * is a bit fuzzy and one might need to check all possible
 * routes.
 */
2391 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2393 for_each_fib6_node_rt_rcu(fn) {
2394 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
2396 if (fib6_check_expired(rt))
2398 if (rt->rt6i_flags & RTF_REJECT)
2400 if (!(rt->rt6i_flags & RTF_GATEWAY))
2402 if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
/* rt_cache's gateway might be different from its 'parent'
 * in the case of an ip redirect.
 * So we keep searching in the exception table if the gateway
 * is different.
 */
2409 if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
2410 rt_cache = rt6_find_cached_rt(rt,
2414 ipv6_addr_equal(&rdfl->gateway,
2415 &rt_cache->rt6i_gateway)) {
2425 rt = net->ipv6.fib6_null_entry;
2426 else if (rt->rt6i_flags & RTF_REJECT) {
2427 ret = net->ipv6.ip6_null_entry;
2431 if (rt == net->ipv6.fib6_null_entry) {
2432 fn = fib6_backtrack(fn, &fl6->saddr);
2439 dst_hold(&ret->dst);
2441 ret = ip6_create_rt_rcu(rt);
2445 trace_fib6_table_lookup(net, ret, table, fl6);
2449 static struct dst_entry *ip6_route_redirect(struct net *net,
2450 const struct flowi6 *fl6,
2451 const struct sk_buff *skb,
2452 const struct in6_addr *gateway)
2454 int flags = RT6_LOOKUP_F_HAS_SADDR;
2455 struct ip6rd_flowi rdfl;
2458 rdfl.gateway = *gateway;
2460 return fib6_rule_lookup(net, &rdfl.fl6, skb,
2461 flags, __ip6_route_redirect);
2464 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2467 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2468 struct dst_entry *dst;
2471 memset(&fl6, 0, sizeof(fl6));
2472 fl6.flowi6_iif = LOOPBACK_IFINDEX;
2473 fl6.flowi6_oif = oif;
2474 fl6.flowi6_mark = mark;
2475 fl6.daddr = iph->daddr;
2476 fl6.saddr = iph->saddr;
2477 fl6.flowlabel = ip6_flowinfo(iph);
2478 fl6.flowi6_uid = uid;
2480 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2481 rt6_do_redirect(dst, NULL, skb);
2484 EXPORT_SYMBOL_GPL(ip6_redirect);
2486 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2489 const struct ipv6hdr *iph = ipv6_hdr(skb);
2490 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2491 struct dst_entry *dst;
2494 memset(&fl6, 0, sizeof(fl6));
2495 fl6.flowi6_iif = LOOPBACK_IFINDEX;
2496 fl6.flowi6_oif = oif;
2497 fl6.flowi6_mark = mark;
2498 fl6.daddr = msg->dest;
2499 fl6.saddr = iph->daddr;
2500 fl6.flowi6_uid = sock_net_uid(net, NULL);
2502 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2503 rt6_do_redirect(dst, NULL, skb);
2507 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2509 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2512 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2514 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2516 struct net_device *dev = dst->dev;
2517 unsigned int mtu = dst_mtu(dst);
2518 struct net *net = dev_net(dev);
2520 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2522 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2523 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2526 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2527 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2528 * IPV6_MAXPLEN is also valid and means: "any MSS,
2529 * rely only on pmtu discovery"
2531 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2536 static unsigned int ip6_mtu(const struct dst_entry *dst)
2538 struct inet6_dev *idev;
2541 mtu = dst_metric_raw(dst, RTAX_MTU);
2548 idev = __in6_dev_get(dst->dev);
2550 mtu = idev->cnf.mtu6;
2554 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2556 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
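/* For illustration: on a link with a 1500 byte MTU, and assuming the usual
 * header sizes (sizeof(struct ipv6hdr) == 40, sizeof(struct tcphdr) == 20),
 * ip6_default_advmss() above computes 1500 - 40 - 20 = 1440, which lies
 * between the default ip6_rt_min_advmss of IPV6_MIN_MTU - 20 - 40 = 1220
 * and the IPV6_MAXPLEN - 20 ceiling, so 1440 is the advertised MSS.
 * ip6_mtu() in turn prefers an explicit RTAX_MTU metric, falls back to the
 * device's cnf.mtu6, caps the result at IP6_MAX_MTU and finally subtracts
 * any lwtunnel encapsulation headroom.
 */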
2559 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2562 struct dst_entry *dst;
2563 struct rt6_info *rt;
2564 struct inet6_dev *idev = in6_dev_get(dev);
2565 struct net *net = dev_net(dev);
2567 if (unlikely(!idev))
2568 return ERR_PTR(-ENODEV);
2570 rt = ip6_dst_alloc(net, dev, 0);
2571 if (unlikely(!rt)) {
2573 dst = ERR_PTR(-ENOMEM);
2577 rt->dst.flags |= DST_HOST;
2578 rt->dst.input = ip6_input;
2579 rt->dst.output = ip6_output;
2580 rt->rt6i_gateway = fl6->daddr;
2581 rt->rt6i_dst.addr = fl6->daddr;
2582 rt->rt6i_dst.plen = 128;
2583 rt->rt6i_idev = idev;
2584 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2586 /* Add this dst into uncached_list so that rt6_disable_ip() can
2587 * do proper release of the net_device
2589 rt6_uncached_list_add(rt);
2590 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2592 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2598 static int ip6_dst_gc(struct dst_ops *ops)
2600 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2601 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2602 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2603 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2604 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2605 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2608 entries = dst_entries_get_fast(ops);
2609 if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2610 entries <= rt_max_size)
2613 net->ipv6.ip6_rt_gc_expire++;
2614 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2615 entries = dst_entries_get_slow(ops);
2616 if (entries < ops->gc_thresh)
2617 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2619 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2620 return entries > rt_max_size;
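/* A rough numeric sketch of the backoff above, using the defaults from
 * ip6_route_net_init() (ip6_rt_gc_timeout = 60*HZ, gc_elasticity = 9):
 * ip6_rt_gc_expire is the "unused for longer than this" horizon handed to
 * fib6_run_gc().  It starts at 30*HZ, is nudged up by one jiffy per call,
 * and while the table stays at or above ops->gc_thresh it also decays by
 * expire >> 9, i.e. roughly 0.2% per round, so sustained pressure makes the
 * garbage collector progressively more aggressive.  Once the entry count
 * drops below gc_thresh the horizon snaps back to ip6_rt_gc_timeout / 2.
 */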
2623 static int ip6_convert_metrics(struct net *net, struct rt6_info *rt,
2624 struct fib6_config *cfg)
2629 rt->fib6_metrics = kzalloc(sizeof(*rt->fib6_metrics),
2631 if (unlikely(!rt->fib6_metrics))
2634 refcount_set(&rt->fib6_metrics->refcnt, 1);
2636 err = ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len,
2637 rt->fib6_metrics->metrics);
2643 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2644 struct fib6_config *cfg,
2645 const struct in6_addr *gw_addr,
2646 u32 tbid, int flags)
2648 struct flowi6 fl6 = {
2649 .flowi6_oif = cfg->fc_ifindex,
2651 .saddr = cfg->fc_prefsrc,
2653 struct fib6_table *table;
2654 struct rt6_info *rt;
2656 table = fib6_get_table(net, tbid);
2660 if (!ipv6_addr_any(&cfg->fc_prefsrc))
2661 flags |= RT6_LOOKUP_F_HAS_SADDR;
2663 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2664 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2666 /* if table lookup failed, fall back to full lookup */
2667 if (rt == net->ipv6.ip6_null_entry) {
2675 static int ip6_route_check_nh_onlink(struct net *net,
2676 struct fib6_config *cfg,
2677 const struct net_device *dev,
2678 struct netlink_ext_ack *extack)
2680 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2681 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2682 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2683 struct rt6_info *grt;
2687 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2689 if (!grt->dst.error &&
2690 (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2691 NL_SET_ERR_MSG(extack,
2692 "Nexthop has invalid gateway or device mismatch");
2702 static int ip6_route_check_nh(struct net *net,
2703 struct fib6_config *cfg,
2704 struct net_device **_dev,
2705 struct inet6_dev **idev)
2707 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2708 struct net_device *dev = _dev ? *_dev : NULL;
2709 struct rt6_info *grt = NULL;
2710 int err = -EHOSTUNREACH;
2712 if (cfg->fc_table) {
2713 int flags = RT6_LOOKUP_F_IFACE;
2715 grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2716 cfg->fc_table, flags);
2718 if (grt->rt6i_flags & RTF_GATEWAY ||
2719 (dev && dev != grt->dst.dev)) {
2727 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2733 if (dev != grt->dst.dev) {
2738 *_dev = dev = grt->dst.dev;
2739 *idev = grt->rt6i_idev;
2741 in6_dev_hold(grt->rt6i_idev);
2744 if (!(grt->rt6i_flags & RTF_GATEWAY))
2753 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2754 struct net_device **_dev, struct inet6_dev **idev,
2755 struct netlink_ext_ack *extack)
2757 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2758 int gwa_type = ipv6_addr_type(gw_addr);
2759 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2760 const struct net_device *dev = *_dev;
2761 bool need_addr_check = !dev;
2764 /* if gw_addr is local we will fail to detect this in case the
2765 * address is still TENTATIVE (DAD in progress). rt6_lookup()
2766 * will return the already-added prefix route via the interface that
2767 * the prefix route was assigned to, which might not be loopback.
2770 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2771 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2775 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2776 /* IPv6 strictly inhibits using non-link-local
2777 * addresses as nexthop addresses.
2778 * Otherwise, the router will not be able to send redirects.
2779 * It is very good, but in some (rare!) circumstances
2780 * (SIT, PtP, NBMA NOARP links) it is handy to allow
2781 * some exceptions. --ANK
2782 * We allow IPv4-mapped nexthops to support RFC4798-type addressing.
2785 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2786 NL_SET_ERR_MSG(extack, "Invalid gateway address");
2790 if (cfg->fc_flags & RTNH_F_ONLINK)
2791 err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2793 err = ip6_route_check_nh(net, cfg, _dev, idev);
2799 /* reload in case device was changed */
2804 NL_SET_ERR_MSG(extack, "Egress device not specified");
2806 } else if (dev->flags & IFF_LOOPBACK) {
2807 NL_SET_ERR_MSG(extack,
2808 "Egress device can not be loopback device for this route");
2812 /* if we did not check gw_addr above, do so now that the
2813 * egress device has been resolved.
2815 if (need_addr_check &&
2816 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2817 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2826 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
2828 struct netlink_ext_ack *extack)
2830 struct net *net = cfg->fc_nlinfo.nl_net;
2831 struct rt6_info *rt = NULL;
2832 struct net_device *dev = NULL;
2833 struct inet6_dev *idev = NULL;
2834 struct fib6_table *table;
2838 /* RTF_PCPU is an internal flag; can not be set by userspace */
2839 if (cfg->fc_flags & RTF_PCPU) {
2840 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2844 /* RTF_CACHE is an internal flag; can not be set by userspace */
2845 if (cfg->fc_flags & RTF_CACHE) {
2846 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2850 if (cfg->fc_type > RTN_MAX) {
2851 NL_SET_ERR_MSG(extack, "Invalid route type");
2855 if (cfg->fc_dst_len > 128) {
2856 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2859 if (cfg->fc_src_len > 128) {
2860 NL_SET_ERR_MSG(extack, "Invalid source address length");
2863 #ifndef CONFIG_IPV6_SUBTREES
2864 if (cfg->fc_src_len) {
2865 NL_SET_ERR_MSG(extack,
2866 "Specifying source address requires IPV6_SUBTREES to be enabled");
2870 if (cfg->fc_ifindex) {
2872 dev = dev_get_by_index(net, cfg->fc_ifindex);
2875 idev = in6_dev_get(dev);
2880 if (cfg->fc_metric == 0)
2881 cfg->fc_metric = IP6_RT_PRIO_USER;
2883 if (cfg->fc_flags & RTNH_F_ONLINK) {
2885 NL_SET_ERR_MSG(extack,
2886 "Nexthop device required for onlink");
2891 if (!(dev->flags & IFF_UP)) {
2892 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2899 if (cfg->fc_nlinfo.nlh &&
2900 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2901 table = fib6_get_table(net, cfg->fc_table);
2903 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2904 table = fib6_new_table(net, cfg->fc_table);
2907 table = fib6_new_table(net, cfg->fc_table);
2914 rt = fib6_info_alloc(gfp_flags);
2918 if (cfg->fc_flags & RTF_ADDRCONF)
2919 rt->dst_nocount = true;
2921 err = ip6_convert_metrics(net, rt, cfg);
2925 if (cfg->fc_flags & RTF_EXPIRES)
2926 fib6_set_expires(rt, jiffies +
2927 clock_t_to_jiffies(cfg->fc_expires));
2929 fib6_clean_expires(rt);
2931 if (cfg->fc_protocol == RTPROT_UNSPEC)
2932 cfg->fc_protocol = RTPROT_BOOT;
2933 rt->rt6i_protocol = cfg->fc_protocol;
2935 addr_type = ipv6_addr_type(&cfg->fc_dst);
2937 if (cfg->fc_encap) {
2938 struct lwtunnel_state *lwtstate;
2940 err = lwtunnel_build_state(cfg->fc_encap_type,
2941 cfg->fc_encap, AF_INET6, cfg,
2945 rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
2948 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
2949 rt->rt6i_dst.plen = cfg->fc_dst_len;
2950 if (rt->rt6i_dst.plen == 128)
2951 rt->dst_host = true;
2953 #ifdef CONFIG_IPV6_SUBTREES
2954 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
2955 rt->rt6i_src.plen = cfg->fc_src_len;
2958 rt->rt6i_metric = cfg->fc_metric;
2959 rt->fib6_nh.nh_weight = 1;
2961 rt->fib6_type = cfg->fc_type;
2963 /* We cannot add true routes via loopback here;
2964 they would result in kernel looping, so promote them to reject routes
2966 if ((cfg->fc_flags & RTF_REJECT) ||
2967 (dev && (dev->flags & IFF_LOOPBACK) &&
2968 !(addr_type & IPV6_ADDR_LOOPBACK) &&
2969 !(cfg->fc_flags & RTF_LOCAL))) {
2970 /* hold loopback dev/idev if we haven't done so. */
2971 if (dev != net->loopback_dev) {
2976 dev = net->loopback_dev;
2978 idev = in6_dev_get(dev);
2984 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
2988 if (cfg->fc_flags & RTF_GATEWAY) {
2989 err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
2993 rt->fib6_nh.nh_gw = cfg->fc_gateway;
3000 if (idev->cnf.disable_ipv6) {
3001 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3006 if (!(dev->flags & IFF_UP)) {
3007 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3012 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3013 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3014 NL_SET_ERR_MSG(extack, "Invalid source address");
3018 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
3019 rt->rt6i_prefsrc.plen = 128;
3021 rt->rt6i_prefsrc.plen = 0;
3023 rt->rt6i_flags = cfg->fc_flags;
3026 if (!(rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3027 !netif_carrier_ok(dev))
3028 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
3029 rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
3030 rt->fib6_nh.nh_dev = dev;
3031 rt->rt6i_idev = idev;
3032 rt->rt6i_table = table;
3034 cfg->fc_nlinfo.nl_net = dev_net(dev);
3043 fib6_info_release(rt);
3044 return ERR_PTR(err);
3047 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3048 struct netlink_ext_ack *extack)
3050 struct rt6_info *rt;
3053 rt = ip6_route_info_create(cfg, gfp_flags, extack);
3057 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3058 fib6_info_release(rt);
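/* A minimal sketch (not an actual in-tree caller) of how a kernel user might
 * fill struct fib6_config before calling ip6_route_add(); only fields used
 * above are shown and all error handling is omitted:
 *
 *	struct fib6_config cfg = {
 *		.fc_table	= RT6_TABLE_MAIN,
 *		.fc_metric	= IP6_RT_PRIO_USER,
 *		.fc_ifindex	= dev->ifindex,
 *		.fc_dst_len	= 64,
 *		.fc_flags	= RTF_UP | RTF_GATEWAY,
 *		.fc_protocol	= RTPROT_BOOT,
 *		.fc_type	= RTN_UNICAST,
 *		.fc_nlinfo.nl_net = net,
 *	};
 *
 *	cfg.fc_dst = prefix;	(struct in6_addr of the destination prefix)
 *	cfg.fc_gateway = gw;	(link-local next hop reachable via dev)
 *	err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
 *
 * ip6_route_info_create() builds the entry and __ip6_ins_rt() links it into
 * the FIB; the caller's reference is dropped via fib6_info_release().
 */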
3063 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
3065 struct net *net = info->nl_net;
3066 struct fib6_table *table;
3069 if (rt == net->ipv6.fib6_null_entry) {
3074 table = rt->rt6i_table;
3075 spin_lock_bh(&table->tb6_lock);
3076 err = fib6_del(rt, info);
3077 spin_unlock_bh(&table->tb6_lock);
3080 fib6_info_release(rt);
3084 int ip6_del_rt(struct net *net, struct rt6_info *rt)
3086 struct nl_info info = { .nl_net = net };
3088 return __ip6_del_rt(rt, &info);
3091 static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
3093 struct nl_info *info = &cfg->fc_nlinfo;
3094 struct net *net = info->nl_net;
3095 struct sk_buff *skb = NULL;
3096 struct fib6_table *table;
3099 if (rt == net->ipv6.fib6_null_entry)
3101 table = rt->rt6i_table;
3102 spin_lock_bh(&table->tb6_lock);
3104 if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
3105 struct rt6_info *sibling, *next_sibling;
3107 /* prefer to send a single notification with all hops */
3108 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3110 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3112 if (rt6_fill_node(net, skb, rt, NULL,
3113 NULL, NULL, 0, RTM_DELROUTE,
3114 info->portid, seq, 0) < 0) {
3118 info->skip_notify = 1;
3121 list_for_each_entry_safe(sibling, next_sibling,
3124 err = fib6_del(sibling, info);
3130 err = fib6_del(rt, info);
3132 spin_unlock_bh(&table->tb6_lock);
3134 fib6_info_release(rt);
3137 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3138 info->nlh, gfp_any());
3143 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3147 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3150 if (cfg->fc_flags & RTF_GATEWAY &&
3151 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3153 if (dst_hold_safe(&rt->dst))
3154 rc = rt6_remove_exception_rt(rt);
3159 static int ip6_route_del(struct fib6_config *cfg,
3160 struct netlink_ext_ack *extack)
3162 struct rt6_info *rt, *rt_cache;
3163 struct fib6_table *table;
3164 struct fib6_node *fn;
3167 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3169 NL_SET_ERR_MSG(extack, "FIB table does not exist");
3175 fn = fib6_locate(&table->tb6_root,
3176 &cfg->fc_dst, cfg->fc_dst_len,
3177 &cfg->fc_src, cfg->fc_src_len,
3178 !(cfg->fc_flags & RTF_CACHE));
3181 for_each_fib6_node_rt_rcu(fn) {
3182 if (cfg->fc_flags & RTF_CACHE) {
3185 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3188 rc = ip6_del_cached_rt(rt_cache, cfg);
3194 if (cfg->fc_ifindex &&
3195 (!rt->fib6_nh.nh_dev ||
3196 rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
3198 if (cfg->fc_flags & RTF_GATEWAY &&
3199 !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
3201 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
3203 if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
3208 /* if a gateway was specified, only delete that one hop */
3209 if (cfg->fc_flags & RTF_GATEWAY)
3210 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3212 return __ip6_del_rt_siblings(rt, cfg);
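/* Matching note: a delete request only has to name the prefix; fc_ifindex,
 * fc_gateway, fc_metric and fc_protocol are compared only when set, and with
 * RTF_CACHE the request targets an exception-table (PMTU/redirect) entry
 * rather than a FIB entry.  As an illustration (iproute2 syntax assumed):
 *
 *	ip -6 route del 2001:db8::/64
 *	ip -6 route del 2001:db8::/64 via fe80::1 dev eth0 metric 1024
 *
 * The second form must match the installed nexthop exactly, and because the
 * gateway is specified it removes only that one hop of a multipath route.
 */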
3220 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3222 struct netevent_redirect netevent;
3223 struct rt6_info *rt, *nrt = NULL;
3224 struct ndisc_options ndopts;
3225 struct inet6_dev *in6_dev;
3226 struct neighbour *neigh;
3228 int optlen, on_link;
3231 optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3232 optlen -= sizeof(*msg);
3235 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3239 msg = (struct rd_msg *)icmp6_hdr(skb);
3241 if (ipv6_addr_is_multicast(&msg->dest)) {
3242 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3247 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3249 } else if (ipv6_addr_type(&msg->target) !=
3250 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3251 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3255 in6_dev = __in6_dev_get(skb->dev);
3258 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3262 * The IP source address of the Redirect MUST be the same as the current
3263 * first-hop router for the specified ICMP Destination Address.
3266 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3267 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3272 if (ndopts.nd_opts_tgt_lladdr) {
3273 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3276 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3281 rt = (struct rt6_info *) dst;
3282 if (rt->rt6i_flags & RTF_REJECT) {
3283 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3287 /* Redirect received -> path was valid.
3288 * Look, redirects are sent only in response to data packets,
3289 * so this nexthop is apparently reachable. --ANK
3291 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3293 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3298 * We have finally decided to accept it.
3301 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3302 NEIGH_UPDATE_F_WEAK_OVERRIDE|
3303 NEIGH_UPDATE_F_OVERRIDE|
3304 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3305 NEIGH_UPDATE_F_ISROUTER)),
3306 NDISC_REDIRECT, &ndopts);
3308 nrt = ip6_rt_cache_alloc(rt->from, &msg->dest, NULL);
3312 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3314 nrt->rt6i_flags &= ~RTF_GATEWAY;
3316 nrt->rt6i_protocol = RTPROT_REDIRECT;
3317 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3319 /* No need to remove rt from the exception table if rt is
3320 * a cached route because rt6_insert_exception() will take care of it.
3323 if (rt6_insert_exception(nrt, rt->from)) {
3324 dst_release_immediate(&nrt->dst);
3328 netevent.old = &rt->dst;
3329 netevent.new = &nrt->dst;
3330 netevent.daddr = &msg->dest;
3331 netevent.neigh = neigh;
3332 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3335 neigh_release(neigh);
3338 #ifdef CONFIG_IPV6_ROUTE_INFO
3339 static struct rt6_info *rt6_get_route_info(struct net *net,
3340 const struct in6_addr *prefix, int prefixlen,
3341 const struct in6_addr *gwaddr,
3342 struct net_device *dev)
3344 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3345 int ifindex = dev->ifindex;
3346 struct fib6_node *fn;
3347 struct rt6_info *rt = NULL;
3348 struct fib6_table *table;
3350 table = fib6_get_table(net, tb_id);
3355 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3359 for_each_fib6_node_rt_rcu(fn) {
3360 if (rt->fib6_nh.nh_dev->ifindex != ifindex)
3362 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3364 if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
3366 ip6_hold_safe(NULL, &rt, false);
3374 static struct rt6_info *rt6_add_route_info(struct net *net,
3375 const struct in6_addr *prefix, int prefixlen,
3376 const struct in6_addr *gwaddr,
3377 struct net_device *dev,
3380 struct fib6_config cfg = {
3381 .fc_metric = IP6_RT_PRIO_USER,
3382 .fc_ifindex = dev->ifindex,
3383 .fc_dst_len = prefixlen,
3384 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3385 RTF_UP | RTF_PREF(pref),
3386 .fc_protocol = RTPROT_RA,
3387 .fc_type = RTN_UNICAST,
3388 .fc_nlinfo.portid = 0,
3389 .fc_nlinfo.nlh = NULL,
3390 .fc_nlinfo.nl_net = net,
3393 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3394 cfg.fc_dst = *prefix;
3395 cfg.fc_gateway = *gwaddr;
3397 /* We should treat it as a default route if prefix length is 0. */
3399 cfg.fc_flags |= RTF_DEFAULT;
3401 ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3403 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3407 struct rt6_info *rt6_get_dflt_router(struct net *net,
3408 const struct in6_addr *addr,
3409 struct net_device *dev)
3411 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3412 struct rt6_info *rt;
3413 struct fib6_table *table;
3415 table = fib6_get_table(net, tb_id);
3420 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3421 if (dev == rt->fib6_nh.nh_dev &&
3422 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3423 ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
3427 ip6_hold_safe(NULL, &rt, false);
3432 struct rt6_info *rt6_add_dflt_router(struct net *net,
3433 const struct in6_addr *gwaddr,
3434 struct net_device *dev,
3437 struct fib6_config cfg = {
3438 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3439 .fc_metric = IP6_RT_PRIO_USER,
3440 .fc_ifindex = dev->ifindex,
3441 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3442 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3443 .fc_protocol = RTPROT_RA,
3444 .fc_type = RTN_UNICAST,
3445 .fc_nlinfo.portid = 0,
3446 .fc_nlinfo.nlh = NULL,
3447 .fc_nlinfo.nl_net = net,
3450 cfg.fc_gateway = *gwaddr;
3452 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3453 struct fib6_table *table;
3455 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3457 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3460 return rt6_get_dflt_router(net, gwaddr, dev);
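/* For illustration only (values made up): a router advertisement from
 * fe80::1 on eth0 with a non-zero Router Lifetime of 1800s and "high"
 * preference is typically installed through the config above as something
 * like
 *
 *	default via fe80::1 dev eth0 proto ra metric 1024 expires 1800 pref high
 *
 * i.e. RTF_ADDRCONF | RTF_DEFAULT | RTF_EXPIRES plus the RTF_PREF() encoding
 * of the preference.  rt6_purge_dflt_routers() later removes such routes on
 * interfaces whose accept_ra is not 2 (e.g. when forwarding is enabled).
 */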
3463 static void __rt6_purge_dflt_routers(struct net *net,
3464 struct fib6_table *table)
3466 struct rt6_info *rt;
3470 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3471 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3472 (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
3475 ip6_del_rt(net, rt);
3481 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3484 void rt6_purge_dflt_routers(struct net *net)
3486 struct fib6_table *table;
3487 struct hlist_head *head;
3492 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3493 head = &net->ipv6.fib_table_hash[h];
3494 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3495 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3496 __rt6_purge_dflt_routers(net, table);
3503 static void rtmsg_to_fib6_config(struct net *net,
3504 struct in6_rtmsg *rtmsg,
3505 struct fib6_config *cfg)
3507 memset(cfg, 0, sizeof(*cfg));
3509 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3511 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3512 cfg->fc_metric = rtmsg->rtmsg_metric;
3513 cfg->fc_expires = rtmsg->rtmsg_info;
3514 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3515 cfg->fc_src_len = rtmsg->rtmsg_src_len;
3516 cfg->fc_flags = rtmsg->rtmsg_flags;
3517 cfg->fc_type = rtmsg->rtmsg_type;
3519 cfg->fc_nlinfo.nl_net = net;
3521 cfg->fc_dst = rtmsg->rtmsg_dst;
3522 cfg->fc_src = rtmsg->rtmsg_src;
3523 cfg->fc_gateway = rtmsg->rtmsg_gateway;
3526 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3528 struct fib6_config cfg;
3529 struct in6_rtmsg rtmsg;
3533 case SIOCADDRT: /* Add a route */
3534 case SIOCDELRT: /* Delete a route */
3535 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3537 err = copy_from_user(&rtmsg, arg,
3538 sizeof(struct in6_rtmsg));
3542 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3547 err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3550 err = ip6_route_del(&cfg, NULL);
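/* A minimal userspace sketch of the legacy ioctl path above (headers and the
 * in6_rtmsg field names assumed from the usual libc/uapi definitions; error
 * handling omitted).  rtnetlink is the preferred interface, but SIOCADDRT
 * still works via rtmsg_to_fib6_config() and requires CAP_NET_ADMIN:
 *
 *	#include <string.h>
 *	#include <sys/ioctl.h>
 *	#include <sys/socket.h>
 *	#include <arpa/inet.h>
 *	#include <net/if.h>
 *	#include <net/route.h>
 *
 *	int add_v6_route(void)
 *	{
 *		struct in6_rtmsg rt;
 *		int fd = socket(AF_INET6, SOCK_DGRAM, 0);
 *
 *		memset(&rt, 0, sizeof(rt));
 *		inet_pton(AF_INET6, "2001:db8::", &rt.rtmsg_dst);
 *		inet_pton(AF_INET6, "fe80::1", &rt.rtmsg_gateway);
 *		rt.rtmsg_dst_len = 64;
 *		rt.rtmsg_metric = 1024;
 *		rt.rtmsg_flags = RTF_UP | RTF_GATEWAY;
 *		rt.rtmsg_ifindex = if_nametoindex("eth0");
 *		return ioctl(fd, SIOCADDRT, &rt);
 *	}
 */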
3564 * Drop the packet on the floor
3567 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3570 struct dst_entry *dst = skb_dst(skb);
3571 switch (ipstats_mib_noroutes) {
3572 case IPSTATS_MIB_INNOROUTES:
3573 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3574 if (type == IPV6_ADDR_ANY) {
3575 IP6_INC_STATS(dev_net(dst->dev),
3576 __in6_dev_get_safely(skb->dev),
3577 IPSTATS_MIB_INADDRERRORS);
3581 case IPSTATS_MIB_OUTNOROUTES:
3582 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3583 ipstats_mib_noroutes);
3586 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3591 static int ip6_pkt_discard(struct sk_buff *skb)
3593 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3596 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3598 skb->dev = skb_dst(skb)->dev;
3599 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3602 static int ip6_pkt_prohibit(struct sk_buff *skb)
3604 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3607 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3609 skb->dev = skb_dst(skb)->dev;
3610 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3614 * Allocate a dst for local (unicast / anycast) address.
3617 struct rt6_info *addrconf_dst_alloc(struct net *net,
3618 struct inet6_dev *idev,
3619 const struct in6_addr *addr,
3620 bool anycast, gfp_t gfp_flags)
3623 struct net_device *dev = idev->dev;
3624 struct rt6_info *rt;
3626 rt = fib6_info_alloc(gfp_flags);
3628 return ERR_PTR(-ENOMEM);
3630 rt->dst_nocount = true;
3633 rt->rt6i_idev = idev;
3635 rt->dst_host = true;
3636 rt->rt6i_protocol = RTPROT_KERNEL;
3637 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
3639 rt->fib6_type = RTN_ANYCAST;
3640 rt->rt6i_flags |= RTF_ANYCAST;
3642 rt->fib6_type = RTN_LOCAL;
3643 rt->rt6i_flags |= RTF_LOCAL;
3646 rt->fib6_nh.nh_gw = *addr;
3648 rt->fib6_nh.nh_dev = dev;
3649 rt->rt6i_dst.addr = *addr;
3650 rt->rt6i_dst.plen = 128;
3651 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3652 rt->rt6i_table = fib6_get_table(net, tb_id);
3657 /* remove deleted ip from prefsrc entries */
3658 struct arg_dev_net_ip {
3659 struct net_device *dev;
3661 struct in6_addr *addr;
3664 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
3666 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3667 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3668 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3670 if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3671 rt != net->ipv6.fib6_null_entry &&
3672 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
3673 spin_lock_bh(&rt6_exception_lock);
3674 /* remove prefsrc entry */
3675 rt->rt6i_prefsrc.plen = 0;
3676 /* need to update cache as well */
3677 rt6_exceptions_remove_prefsrc(rt);
3678 spin_unlock_bh(&rt6_exception_lock);
3683 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3685 struct net *net = dev_net(ifp->idev->dev);
3686 struct arg_dev_net_ip adni = {
3687 .dev = ifp->idev->dev,
3691 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3694 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3696 /* Remove routers and update dst entries when a gateway turns into a host. */
3697 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
3699 struct in6_addr *gateway = (struct in6_addr *)arg;
3701 if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3702 ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
3706 /* Further clean up cached routes in the exception table.
3707 * This is needed because a cached route may have a different
3708 * gateway than its 'parent' in the case of an IP redirect.
3710 rt6_exceptions_clean_tohost(rt, gateway);
3715 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3717 fib6_clean_all(net, fib6_clean_tohost, gateway);
3720 struct arg_netdev_event {
3721 const struct net_device *dev;
3723 unsigned int nh_flags;
3724 unsigned long event;
3728 static struct rt6_info *rt6_multipath_first_sibling(const struct rt6_info *rt)
3730 struct rt6_info *iter;
3731 struct fib6_node *fn;
3733 fn = rcu_dereference_protected(rt->rt6i_node,
3734 lockdep_is_held(&rt->rt6i_table->tb6_lock));
3735 iter = rcu_dereference_protected(fn->leaf,
3736 lockdep_is_held(&rt->rt6i_table->tb6_lock));
3738 if (iter->rt6i_metric == rt->rt6i_metric &&
3739 rt6_qualify_for_ecmp(iter))
3741 iter = rcu_dereference_protected(iter->rt6_next,
3742 lockdep_is_held(&rt->rt6i_table->tb6_lock));
3748 static bool rt6_is_dead(const struct rt6_info *rt)
3750 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3751 (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3752 rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
3758 static int rt6_multipath_total_weight(const struct rt6_info *rt)
3760 struct rt6_info *iter;
3763 if (!rt6_is_dead(rt))
3764 total += rt->fib6_nh.nh_weight;
3766 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) {
3767 if (!rt6_is_dead(iter))
3768 total += iter->fib6_nh.nh_weight;
3774 static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total)
3776 int upper_bound = -1;
3778 if (!rt6_is_dead(rt)) {
3779 *weight += rt->fib6_nh.nh_weight;
3780 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3783 atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
3786 static void rt6_multipath_upper_bound_set(struct rt6_info *rt, int total)
3788 struct rt6_info *iter;
3791 rt6_upper_bound_set(rt, &weight, total);
3793 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3794 rt6_upper_bound_set(iter, &weight, total);
3797 void rt6_multipath_rebalance(struct rt6_info *rt)
3799 struct rt6_info *first;
3802 /* In case the entire multipath route was marked for flushing,
3803 * then there is no need to rebalance upon the removal of every sibling route.
3806 if (!rt->rt6i_nsiblings || rt->should_flush)
3809 /* During lookup routes are evaluated in order, so we need to
3810 * make sure upper bounds are assigned from the first sibling onwards.
3813 first = rt6_multipath_first_sibling(rt);
3814 if (WARN_ON_ONCE(!first))
3817 total = rt6_multipath_total_weight(first);
3818 rt6_multipath_upper_bound_set(first, total);
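/* Worked example with made-up numbers: two siblings with nh_weight 1 and 3
 * give a total of 4, so rt6_upper_bound_set() assigns cumulative bounds of
 * roughly (1 * 2^31) / 4 and (4 * 2^31) / 4.  A flow hash uniformly spread
 * over [0, 2^31) then falls below the first bound about 25% of the time and
 * selects the first nexthop, otherwise the second, matching the 1:3 weights.
 * Nexthops that rt6_is_dead() reports as dead or link-down contribute no
 * weight and keep an upper bound of -1, so they are never selected.
 */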
3821 static int fib6_ifup(struct rt6_info *rt, void *p_arg)
3823 const struct arg_netdev_event *arg = p_arg;
3824 struct net *net = dev_net(arg->dev);
3826 if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
3827 rt->fib6_nh.nh_flags &= ~arg->nh_flags;
3828 fib6_update_sernum_upto_root(net, rt);
3829 rt6_multipath_rebalance(rt);
3835 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3837 struct arg_netdev_event arg = {
3840 .nh_flags = nh_flags,
3844 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3845 arg.nh_flags |= RTNH_F_LINKDOWN;
3847 fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3850 static bool rt6_multipath_uses_dev(const struct rt6_info *rt,
3851 const struct net_device *dev)
3853 struct rt6_info *iter;
3855 if (rt->fib6_nh.nh_dev == dev)
3857 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3858 if (iter->fib6_nh.nh_dev == dev)
3864 static void rt6_multipath_flush(struct rt6_info *rt)
3866 struct rt6_info *iter;
3868 rt->should_flush = 1;
3869 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3870 iter->should_flush = 1;
3873 static unsigned int rt6_multipath_dead_count(const struct rt6_info *rt,
3874 const struct net_device *down_dev)
3876 struct rt6_info *iter;
3877 unsigned int dead = 0;
3879 if (rt->fib6_nh.nh_dev == down_dev ||
3880 rt->fib6_nh.nh_flags & RTNH_F_DEAD)
3882 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3883 if (iter->fib6_nh.nh_dev == down_dev ||
3884 iter->fib6_nh.nh_flags & RTNH_F_DEAD)
3890 static void rt6_multipath_nh_flags_set(struct rt6_info *rt,
3891 const struct net_device *dev,
3892 unsigned int nh_flags)
3894 struct rt6_info *iter;
3896 if (rt->fib6_nh.nh_dev == dev)
3897 rt->fib6_nh.nh_flags |= nh_flags;
3898 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3899 if (iter->fib6_nh.nh_dev == dev)
3900 iter->fib6_nh.nh_flags |= nh_flags;
3903 /* called with write lock held for table with rt */
3904 static int fib6_ifdown(struct rt6_info *rt, void *p_arg)
3906 const struct arg_netdev_event *arg = p_arg;
3907 const struct net_device *dev = arg->dev;
3908 struct net *net = dev_net(dev);
3910 if (rt == net->ipv6.fib6_null_entry)
3913 switch (arg->event) {
3914 case NETDEV_UNREGISTER:
3915 return rt->fib6_nh.nh_dev == dev ? -1 : 0;
3917 if (rt->should_flush)
3919 if (!rt->rt6i_nsiblings)
3920 return rt->fib6_nh.nh_dev == dev ? -1 : 0;
3921 if (rt6_multipath_uses_dev(rt, dev)) {
3924 count = rt6_multipath_dead_count(rt, dev);
3925 if (rt->rt6i_nsiblings + 1 == count) {
3926 rt6_multipath_flush(rt);
3929 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
3931 fib6_update_sernum(net, rt);
3932 rt6_multipath_rebalance(rt);
3936 if (rt->fib6_nh.nh_dev != dev ||
3937 rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST))
3939 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
3940 rt6_multipath_rebalance(rt);
3947 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
3949 struct arg_netdev_event arg = {
3956 fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
3959 void rt6_disable_ip(struct net_device *dev, unsigned long event)
3961 rt6_sync_down_dev(dev, event);
3962 rt6_uncached_list_flush_dev(dev_net(dev), dev);
3963 neigh_ifdown(&nd_tbl, dev);
3966 struct rt6_mtu_change_arg {
3967 struct net_device *dev;
3971 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
3973 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
3974 struct inet6_dev *idev;
3976 /* In IPv6, PMTU discovery is not optional,
3977 so the RTAX_MTU lock cannot disable it.
3978 We still use this lock to block changes
3979 caused by addrconf/ndisc.
3982 idev = __in6_dev_get(arg->dev);
3986 /* For an administrative MTU increase, there is no way to discover
3987 an IPv6 PMTU increase, so the PMTU should be updated here.
3988 Since RFC 1981 doesn't cover administrative MTU increases,
3989 updating the PMTU on such an increase is a MUST (e.g. for jumbo frames).
3991 if (rt->fib6_nh.nh_dev == arg->dev &&
3992 !fib6_metric_locked(rt, RTAX_MTU)) {
3993 u32 mtu = rt->fib6_pmtu;
3995 if (mtu >= arg->mtu ||
3996 (mtu < arg->mtu && mtu == idev->cnf.mtu6))
3997 fib6_metric_set(rt, RTAX_MTU, arg->mtu);
3999 spin_lock_bh(&rt6_exception_lock);
4000 rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4001 spin_unlock_bh(&rt6_exception_lock);
4006 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4008 struct rt6_mtu_change_arg arg = {
4013 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4016 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4017 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
4018 [RTA_OIF] = { .type = NLA_U32 },
4019 [RTA_IIF] = { .type = NLA_U32 },
4020 [RTA_PRIORITY] = { .type = NLA_U32 },
4021 [RTA_METRICS] = { .type = NLA_NESTED },
4022 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
4023 [RTA_PREF] = { .type = NLA_U8 },
4024 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
4025 [RTA_ENCAP] = { .type = NLA_NESTED },
4026 [RTA_EXPIRES] = { .type = NLA_U32 },
4027 [RTA_UID] = { .type = NLA_U32 },
4028 [RTA_MARK] = { .type = NLA_U32 },
4031 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4032 struct fib6_config *cfg,
4033 struct netlink_ext_ack *extack)
4036 struct nlattr *tb[RTA_MAX+1];
4040 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4046 rtm = nlmsg_data(nlh);
4047 memset(cfg, 0, sizeof(*cfg));
4049 cfg->fc_table = rtm->rtm_table;
4050 cfg->fc_dst_len = rtm->rtm_dst_len;
4051 cfg->fc_src_len = rtm->rtm_src_len;
4052 cfg->fc_flags = RTF_UP;
4053 cfg->fc_protocol = rtm->rtm_protocol;
4054 cfg->fc_type = rtm->rtm_type;
4056 if (rtm->rtm_type == RTN_UNREACHABLE ||
4057 rtm->rtm_type == RTN_BLACKHOLE ||
4058 rtm->rtm_type == RTN_PROHIBIT ||
4059 rtm->rtm_type == RTN_THROW)
4060 cfg->fc_flags |= RTF_REJECT;
4062 if (rtm->rtm_type == RTN_LOCAL)
4063 cfg->fc_flags |= RTF_LOCAL;
4065 if (rtm->rtm_flags & RTM_F_CLONED)
4066 cfg->fc_flags |= RTF_CACHE;
4068 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4070 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
4071 cfg->fc_nlinfo.nlh = nlh;
4072 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
4074 if (tb[RTA_GATEWAY]) {
4075 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4076 cfg->fc_flags |= RTF_GATEWAY;
4080 int plen = (rtm->rtm_dst_len + 7) >> 3;
4082 if (nla_len(tb[RTA_DST]) < plen)
4085 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4089 int plen = (rtm->rtm_src_len + 7) >> 3;
4091 if (nla_len(tb[RTA_SRC]) < plen)
4094 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4097 if (tb[RTA_PREFSRC])
4098 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4101 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4103 if (tb[RTA_PRIORITY])
4104 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4106 if (tb[RTA_METRICS]) {
4107 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4108 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4112 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4114 if (tb[RTA_MULTIPATH]) {
4115 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4116 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4118 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4119 cfg->fc_mp_len, extack);
4125 pref = nla_get_u8(tb[RTA_PREF]);
4126 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4127 pref != ICMPV6_ROUTER_PREF_HIGH)
4128 pref = ICMPV6_ROUTER_PREF_MEDIUM;
4129 cfg->fc_flags |= RTF_PREF(pref);
4133 cfg->fc_encap = tb[RTA_ENCAP];
4135 if (tb[RTA_ENCAP_TYPE]) {
4136 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4138 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4143 if (tb[RTA_EXPIRES]) {
4144 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4146 if (addrconf_finite_timeout(timeout)) {
4147 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4148 cfg->fc_flags |= RTF_EXPIRES;
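/* Tying the attribute handling above together: an illustrative request such
 * as (iproute2 syntax assumed)
 *
 *	ip -6 route add 2001:db8::/64 via fe80::1 dev eth0 expires 600 pref high
 *
 * arrives as RTA_DST, RTA_GATEWAY and RTA_OIF plus RTA_EXPIRES = 600 and
 * RTA_PREF = ICMPV6_ROUTER_PREF_HIGH, and ends up as fc_expires (converted
 * to clock_t) with RTF_EXPIRES set and RTF_PREF(pref) folded into fc_flags.
 */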
4158 struct rt6_info *rt6_info;
4159 struct fib6_config r_cfg;
4160 struct list_head next;
4163 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4167 list_for_each_entry(nh, rt6_nh_list, next) {
4168 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4169 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4170 nh->r_cfg.fc_ifindex);
4174 static int ip6_route_info_append(struct net *net,
4175 struct list_head *rt6_nh_list,
4176 struct rt6_info *rt, struct fib6_config *r_cfg)
4181 list_for_each_entry(nh, rt6_nh_list, next) {
4182 /* check if rt6_info already exists */
4183 if (rt6_duplicate_nexthop(nh->rt6_info, rt))
4187 nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4191 err = ip6_convert_metrics(net, rt, r_cfg);
4196 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4197 list_add_tail(&nh->next, rt6_nh_list);
4202 static void ip6_route_mpath_notify(struct rt6_info *rt,
4203 struct rt6_info *rt_last,
4204 struct nl_info *info,
4207 /* if this is an APPEND route, then rt points to the first route
4208 * inserted and rt_last points to last route inserted. Userspace
4209 * wants a consistent dump of the route which starts at the first
4210 * nexthop. Since sibling routes are always added at the end of
4211 * the list, find the first sibling of the last route appended
4213 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
4214 rt = list_first_entry(&rt_last->rt6i_siblings,
4220 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4223 static int ip6_route_multipath_add(struct fib6_config *cfg,
4224 struct netlink_ext_ack *extack)
4226 struct rt6_info *rt_notif = NULL, *rt_last = NULL;
4227 struct nl_info *info = &cfg->fc_nlinfo;
4228 struct fib6_config r_cfg;
4229 struct rtnexthop *rtnh;
4230 struct rt6_info *rt;
4231 struct rt6_nh *err_nh;
4232 struct rt6_nh *nh, *nh_safe;
4238 int replace = (cfg->fc_nlinfo.nlh &&
4239 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4240 LIST_HEAD(rt6_nh_list);
4242 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4243 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4244 nlflags |= NLM_F_APPEND;
4246 remaining = cfg->fc_mp_len;
4247 rtnh = (struct rtnexthop *)cfg->fc_mp;
4249 /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4250 * rt6_info structs per nexthop
4252 while (rtnh_ok(rtnh, remaining)) {
4253 memcpy(&r_cfg, cfg, sizeof(*cfg));
4254 if (rtnh->rtnh_ifindex)
4255 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4257 attrlen = rtnh_attrlen(rtnh);
4259 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4261 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4263 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4264 r_cfg.fc_flags |= RTF_GATEWAY;
4266 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4267 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4269 r_cfg.fc_encap_type = nla_get_u16(nla);
4272 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4273 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4280 rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
4282 err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4285 fib6_info_release(rt);
4289 rtnh = rtnh_next(rtnh, &remaining);
4292 /* for add and replace send one notification with all nexthops.
4293 * Skip the notification in fib6_add_rt2node and send one with
4294 * the full route when done
4296 info->skip_notify = 1;
4299 list_for_each_entry(nh, &rt6_nh_list, next) {
4300 rt_last = nh->rt6_info;
4301 err = __ip6_ins_rt(nh->rt6_info, info, extack);
4302 fib6_info_release(nh->rt6_info);
4304 /* save reference to first route for notification */
4305 if (!rt_notif && !err)
4306 rt_notif = nh->rt6_info;
4308 /* nh->rt6_info is used or freed at this point, reset to NULL*/
4309 nh->rt6_info = NULL;
4312 ip6_print_replace_route_err(&rt6_nh_list);
4317 /* Because each route is added like a single route we remove
4318 * these flags after the first nexthop: if there is a collision,
4319 * we have already failed to add the first nexthop:
4320 * fib6_add_rt2node() has rejected it; when replacing, old
4321 * nexthops have been replaced by first new, the rest should
4324 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4329 /* success ... tell user about new route */
4330 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4334 /* send notification for routes that were added so that
4335 * the delete notifications sent by ip6_route_del are
4339 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4341 /* Delete routes that were already added */
4342 list_for_each_entry(nh, &rt6_nh_list, next) {
4345 ip6_route_del(&nh->r_cfg, extack);
4349 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4351 fib6_info_release(nh->rt6_info);
4352 list_del(&nh->next);
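/* A userspace-level sketch of what the loop above consumes (iproute2 syntax
 * assumed):
 *
 *	ip -6 route add 2001:db8::/64 \
 *		nexthop via fe80::1 dev eth0 weight 1 \
 *		nexthop via fe80::2 dev eth1 weight 3
 *
 * Each RTA_MULTIPATH element becomes one entry on rt6_nh_list; the weight is
 * carried on the wire as rtnh_hops = weight - 1 and restored above through
 * rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1.  All hops are inserted under
 * a single notification, and on failure the hops already added are deleted
 * again so a partially installed multipath route is rolled back.
 */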
4359 static int ip6_route_multipath_del(struct fib6_config *cfg,
4360 struct netlink_ext_ack *extack)
4362 struct fib6_config r_cfg;
4363 struct rtnexthop *rtnh;
4366 int err = 1, last_err = 0;
4368 remaining = cfg->fc_mp_len;
4369 rtnh = (struct rtnexthop *)cfg->fc_mp;
4371 /* Parse a Multipath Entry */
4372 while (rtnh_ok(rtnh, remaining)) {
4373 memcpy(&r_cfg, cfg, sizeof(*cfg));
4374 if (rtnh->rtnh_ifindex)
4375 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4377 attrlen = rtnh_attrlen(rtnh);
4379 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4381 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4383 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4384 r_cfg.fc_flags |= RTF_GATEWAY;
4387 err = ip6_route_del(&r_cfg, extack);
4391 rtnh = rtnh_next(rtnh, &remaining);
4397 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4398 struct netlink_ext_ack *extack)
4400 struct fib6_config cfg;
4403 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4408 return ip6_route_multipath_del(&cfg, extack);
4410 cfg.fc_delete_all_nh = 1;
4411 return ip6_route_del(&cfg, extack);
4415 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4416 struct netlink_ext_ack *extack)
4418 struct fib6_config cfg;
4421 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4426 return ip6_route_multipath_add(&cfg, extack);
4428 return ip6_route_add(&cfg, GFP_KERNEL, extack);
4431 static size_t rt6_nlmsg_size(struct rt6_info *rt)
4433 int nexthop_len = 0;
4435 if (rt->rt6i_nsiblings) {
4436 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
4437 + NLA_ALIGN(sizeof(struct rtnexthop))
4438 + nla_total_size(16) /* RTA_GATEWAY */
4439 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
4441 nexthop_len *= rt->rt6i_nsiblings;
4444 return NLMSG_ALIGN(sizeof(struct rtmsg))
4445 + nla_total_size(16) /* RTA_SRC */
4446 + nla_total_size(16) /* RTA_DST */
4447 + nla_total_size(16) /* RTA_GATEWAY */
4448 + nla_total_size(16) /* RTA_PREFSRC */
4449 + nla_total_size(4) /* RTA_TABLE */
4450 + nla_total_size(4) /* RTA_IIF */
4451 + nla_total_size(4) /* RTA_OIF */
4452 + nla_total_size(4) /* RTA_PRIORITY */
4453 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4454 + nla_total_size(sizeof(struct rta_cacheinfo))
4455 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4456 + nla_total_size(1) /* RTA_PREF */
4457 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
4461 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
4462 unsigned int *flags, bool skip_oif)
4464 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4465 *flags |= RTNH_F_DEAD;
4467 if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
4468 *flags |= RTNH_F_LINKDOWN;
4469 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
4470 *flags |= RTNH_F_DEAD;
4473 if (rt->rt6i_flags & RTF_GATEWAY) {
4474 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
4475 goto nla_put_failure;
4478 *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
4479 if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
4480 *flags |= RTNH_F_OFFLOAD;
4482 /* not needed for multipath encoding b/c it has a rtnexthop struct */
4483 if (!skip_oif && rt->fib6_nh.nh_dev &&
4484 nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
4485 goto nla_put_failure;
4487 if (rt->fib6_nh.nh_lwtstate &&
4488 lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
4489 goto nla_put_failure;
4497 /* add multipath next hop */
4498 static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
4500 const struct net_device *dev = rt->fib6_nh.nh_dev;
4501 struct rtnexthop *rtnh;
4502 unsigned int flags = 0;
4504 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4506 goto nla_put_failure;
4508 rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
4509 rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
4511 if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4512 goto nla_put_failure;
4514 rtnh->rtnh_flags = flags;
4516 /* length of rtnetlink header + attributes */
4517 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4525 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4526 struct rt6_info *rt, struct dst_entry *dst,
4527 struct in6_addr *dest, struct in6_addr *src,
4528 int iif, int type, u32 portid, u32 seq,
4532 struct nlmsghdr *nlh;
4537 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4541 rtm = nlmsg_data(nlh);
4542 rtm->rtm_family = AF_INET6;
4543 rtm->rtm_dst_len = rt->rt6i_dst.plen;
4544 rtm->rtm_src_len = rt->rt6i_src.plen;
4547 table = rt->rt6i_table->tb6_id;
4549 table = RT6_TABLE_UNSPEC;
4550 rtm->rtm_table = table;
4551 if (nla_put_u32(skb, RTA_TABLE, table))
4552 goto nla_put_failure;
4554 rtm->rtm_type = rt->fib6_type;
4556 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4557 rtm->rtm_protocol = rt->rt6i_protocol;
4559 if (rt->rt6i_flags & RTF_CACHE)
4560 rtm->rtm_flags |= RTM_F_CLONED;
4563 if (nla_put_in6_addr(skb, RTA_DST, dest))
4564 goto nla_put_failure;
4565 rtm->rtm_dst_len = 128;
4566 } else if (rtm->rtm_dst_len)
4567 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
4568 goto nla_put_failure;
4569 #ifdef CONFIG_IPV6_SUBTREES
4571 if (nla_put_in6_addr(skb, RTA_SRC, src))
4572 goto nla_put_failure;
4573 rtm->rtm_src_len = 128;
4574 } else if (rtm->rtm_src_len &&
4575 nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
4576 goto nla_put_failure;
4579 #ifdef CONFIG_IPV6_MROUTE
4580 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
4581 int err = ip6mr_get_route(net, skb, rtm, portid);
4586 goto nla_put_failure;
4589 if (nla_put_u32(skb, RTA_IIF, iif))
4590 goto nla_put_failure;
4592 struct in6_addr saddr_buf;
4593 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4594 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4595 goto nla_put_failure;
4598 if (rt->rt6i_prefsrc.plen) {
4599 struct in6_addr saddr_buf;
4600 saddr_buf = rt->rt6i_prefsrc.addr;
4601 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4602 goto nla_put_failure;
4605 pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
4606 if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4607 goto nla_put_failure;
4609 if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
4610 goto nla_put_failure;
4612 /* For multipath routes, walk the siblings list and add
4613 * each as a nexthop within RTA_MULTIPATH.
4615 if (rt->rt6i_nsiblings) {
4616 struct rt6_info *sibling, *next_sibling;
4619 mp = nla_nest_start(skb, RTA_MULTIPATH);
4621 goto nla_put_failure;
4623 if (rt6_add_nexthop(skb, rt) < 0)
4624 goto nla_put_failure;
4626 list_for_each_entry_safe(sibling, next_sibling,
4627 &rt->rt6i_siblings, rt6i_siblings) {
4628 if (rt6_add_nexthop(skb, sibling) < 0)
4629 goto nla_put_failure;
4632 nla_nest_end(skb, mp);
4634 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4635 goto nla_put_failure;
4638 if (rt->rt6i_flags & RTF_EXPIRES) {
4639 expires = dst ? dst->expires : rt->expires;
4643 if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4644 goto nla_put_failure;
4646 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
4647 goto nla_put_failure;
4650 nlmsg_end(skb, nlh);
4654 nlmsg_cancel(skb, nlh);
4658 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
4660 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4661 struct net *net = arg->net;
4663 if (rt == net->ipv6.fib6_null_entry)
4666 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4667 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4669 /* user wants prefix routes only */
4670 if (rtm->rtm_flags & RTM_F_PREFIX &&
4671 !(rt->rt6i_flags & RTF_PREFIX_RT)) {
4672 /* success since this is not a prefix route */
4677 return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4678 RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4679 arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
4682 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4683 struct netlink_ext_ack *extack)
4685 struct net *net = sock_net(in_skb->sk);
4686 struct nlattr *tb[RTA_MAX+1];
4687 int err, iif = 0, oif = 0;
4688 struct dst_entry *dst;
4689 struct rt6_info *rt;
4690 struct sk_buff *skb;
4695 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4701 memset(&fl6, 0, sizeof(fl6));
4702 rtm = nlmsg_data(nlh);
4703 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4704 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4707 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4710 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4714 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4717 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4721 iif = nla_get_u32(tb[RTA_IIF]);
4724 oif = nla_get_u32(tb[RTA_OIF]);
4727 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4730 fl6.flowi6_uid = make_kuid(current_user_ns(),
4731 nla_get_u32(tb[RTA_UID]));
4733 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4736 struct net_device *dev;
4741 dev = dev_get_by_index_rcu(net, iif);
4748 fl6.flowi6_iif = iif;
4750 if (!ipv6_addr_any(&fl6.saddr))
4751 flags |= RT6_LOOKUP_F_HAS_SADDR;
4753 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
4757 fl6.flowi6_oif = oif;
4759 dst = ip6_route_output(net, NULL, &fl6);
4763 rt = container_of(dst, struct rt6_info, dst);
4764 if (rt->dst.error) {
4765 err = rt->dst.error;
4770 if (rt == net->ipv6.ip6_null_entry) {
4771 err = rt->dst.error;
4776 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4783 skb_dst_set(skb, &rt->dst);
4785 err = rt6_fill_node(net, skb, rt->from, NULL, NULL, NULL, iif,
4786 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4789 err = rt6_fill_node(net, skb, rt->from, dst,
4790 &fl6.daddr, &fl6.saddr, iif, RTM_NEWROUTE,
4791 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
4798 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
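/* Usage note: this handler backs "ip -6 route get" (iproute2 syntax assumed).
 * A plain
 *
 *	ip -6 route get 2001:db8::1
 *
 * reports the resolved route, including per-destination state such as a
 * cached PMTU carried by the dst, while
 *
 *	ip -6 route get fibmatch 2001:db8::1
 *
 * sets RTM_F_FIB_MATCH and dumps the FIB entry (rt->from) that the lookup
 * matched, without the per-destination details.
 */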
4803 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
4804 unsigned int nlm_flags)
4806 struct sk_buff *skb;
4807 struct net *net = info->nl_net;
4812 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4814 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4818 err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
4819 event, info->portid, seq, nlm_flags);
4821 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4822 WARN_ON(err == -EMSGSIZE);
4826 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4827 info->nlh, gfp_any());
4831 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4834 static int ip6_route_dev_notify(struct notifier_block *this,
4835 unsigned long event, void *ptr)
4837 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4838 struct net *net = dev_net(dev);
4840 if (!(dev->flags & IFF_LOOPBACK))
4843 if (event == NETDEV_REGISTER) {
4844 net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
4845 net->ipv6.fib6_null_entry->rt6i_idev = in6_dev_get(dev);
4846 net->ipv6.ip6_null_entry->dst.dev = dev;
4847 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
4848 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4849 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
4850 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
4851 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
4852 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
4854 } else if (event == NETDEV_UNREGISTER &&
4855 dev->reg_state != NETREG_UNREGISTERED) {
4856 /* NETDEV_UNREGISTER could be fired multiple times by
4857 * netdev_wait_allrefs(). Make sure we only call this once.
4859 in6_dev_put_clear(&net->ipv6.fib6_null_entry->rt6i_idev);
4860 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
4861 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4862 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
4863 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
4874 #ifdef CONFIG_PROC_FS
4876 static const struct file_operations ipv6_route_proc_fops = {
4877 .open = ipv6_route_open,
4879 .llseek = seq_lseek,
4880 .release = seq_release_net,
4883 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
4885 struct net *net = (struct net *)seq->private;
4886 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
4887 net->ipv6.rt6_stats->fib_nodes,
4888 net->ipv6.rt6_stats->fib_route_nodes,
4889 atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
4890 net->ipv6.rt6_stats->fib_rt_entries,
4891 net->ipv6.rt6_stats->fib_rt_cache,
4892 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
4893 net->ipv6.rt6_stats->fib_discarded_routes);
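/* The resulting /proc/net/rt6_stats line is seven hex fields.  Reading an
 * illustrative (made-up) sample
 *
 *	0012 0008 0000 001c 0000 0003 0000
 *
 * left to right: FIB nodes, route nodes, rt_alloc, route entries, cached
 * routes, currently allocated dst entries, and discarded routes.
 */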
4898 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
4900 return single_open_net(inode, file, rt6_stats_seq_show);
4903 static const struct file_operations rt6_stats_seq_fops = {
4904 .open = rt6_stats_seq_open,
4906 .llseek = seq_lseek,
4907 .release = single_release_net,
4909 #endif /* CONFIG_PROC_FS */
4911 #ifdef CONFIG_SYSCTL
4914 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4915 void __user *buffer, size_t *lenp, loff_t *ppos)
4922 net = (struct net *)ctl->extra1;
4923 delay = net->ipv6.sysctl.flush_delay;
4924 proc_dointvec(ctl, write, buffer, lenp, ppos);
4925 fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
4929 struct ctl_table ipv6_route_table_template[] = {
4931 .procname = "flush",
4932 .data = &init_net.ipv6.sysctl.flush_delay,
4933 .maxlen = sizeof(int),
4935 .proc_handler = ipv6_sysctl_rtcache_flush
4938 .procname = "gc_thresh",
4939 .data = &ip6_dst_ops_template.gc_thresh,
4940 .maxlen = sizeof(int),
4942 .proc_handler = proc_dointvec,
4945 .procname = "max_size",
4946 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
4947 .maxlen = sizeof(int),
4949 .proc_handler = proc_dointvec,
4952 .procname = "gc_min_interval",
4953 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4954 .maxlen = sizeof(int),
4956 .proc_handler = proc_dointvec_jiffies,
4959 .procname = "gc_timeout",
4960 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
4961 .maxlen = sizeof(int),
4963 .proc_handler = proc_dointvec_jiffies,
4966 .procname = "gc_interval",
4967 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
4968 .maxlen = sizeof(int),
4970 .proc_handler = proc_dointvec_jiffies,
4973 .procname = "gc_elasticity",
4974 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
4975 .maxlen = sizeof(int),
4977 .proc_handler = proc_dointvec,
4980 .procname = "mtu_expires",
4981 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
4982 .maxlen = sizeof(int),
4984 .proc_handler = proc_dointvec_jiffies,
4987 .procname = "min_adv_mss",
4988 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
4989 .maxlen = sizeof(int),
4991 .proc_handler = proc_dointvec,
4994 .procname = "gc_min_interval_ms",
4995 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4996 .maxlen = sizeof(int),
4998 .proc_handler = proc_dointvec_ms_jiffies,
5003 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
5005 struct ctl_table *table;
5007 table = kmemdup(ipv6_route_table_template,
5008 sizeof(ipv6_route_table_template),
5012 table[0].data = &net->ipv6.sysctl.flush_delay;
5013 table[0].extra1 = net;
5014 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
5015 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
5016 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5017 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
5018 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
5019 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
5020 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5021 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5022 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5024 /* Don't export sysctls to unprivileged users */
5025 if (net->user_ns != &init_user_ns)
5026 table[0].procname = NULL;
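/* These entries appear per network namespace under /proc/sys/net/ipv6/route/.
 * A couple of illustrative (not prescriptive) invocations:
 *
 *	sysctl -w net.ipv6.route.gc_thresh=2048
 *	sysctl -w net.ipv6.route.max_size=16384
 *	echo 1 > /proc/sys/net/ipv6/route/flush
 *
 * Writing to "flush" runs fib6_run_gc() via ipv6_sysctl_rtcache_flush(), and
 * the "flush" entry is hidden from non-init user namespaces as noted above.
 */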
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
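/*
 * The "late" pernet ops only create and remove the procfs views
 * (/proc/net/ipv6_route and /proc/net/rt6_stats), once the per-namespace
 * routing state above exists.
 */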
static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
	proc_create("rt6_stats", 0444, net->proc_net, &rt6_stats_seq_fops);
#endif
	return 0;
}

static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
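/* Every namespace gets its own inet_peer base for IPv6 peer bookkeeping. */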
static int __net_init ipv6_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv6.peers = bp;
	return 0;
}
static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	net->ipv6.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}
static struct pernet_operations ipv6_inetpeer_ops = {
	.init = ipv6_inetpeer_init,
	.exit = ipv6_inetpeer_exit,
};

static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};
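/*
 * Device notifier for the special route entries. It is registered at a
 * lower priority than the addrconf notifier (ADDRCONF_NOTIFY_PRIORITY),
 * so addrconf sees device events first.
 */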
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
void __init ip6_route_init_special_entries(void)
{
	/* The loopback device is registered before this code runs, so the
	 * loopback reference in rt6_info is not taken at that point; take
	 * it manually for init_net.
	 */
	init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
	init_net.ipv6.fib6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#endif
}
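/*
 * Module init: create the rt6_info slab cache, register the pernet
 * subsystems, netlink route handlers and the device notifier, and
 * initialise the per-cpu uncached-route lists. Every registration step
 * has a matching unwind label below.
 */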
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;
	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;
	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;
	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;
out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
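/* Undo everything ip6_route_init() set up, in roughly reverse order. */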
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}