/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		reachable.  otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */
#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <trace/events/fib6.h>
#include <linux/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
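/* Scores used by the router-selection path below: the negative values
 * are failure modes returned by rt6_check_neigh()/rt6_score_route();
 * RT6_NUD_FAIL_DO_RR additionally asks the caller to round-robin to
 * the next candidate router (see find_match()).
 */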
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};

static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
static unsigned int	 ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
static int		 ip6_dst_gc(struct dst_ops *ops);

static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int		ip6_pkt_prohibit(struct sk_buff *skb);
static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct fib6_info *rt);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev,
					   unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev);
#endif

struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
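/* rt6_info entries on these per-cpu lists are not owned by the fib6
 * tree (e.g. dsts created by icmp6_dst_alloc() or the KNOWN_NH clones
 * made in ip6_pol_route()); keeping them listed lets
 * rt6_uncached_list_flush_dev() re-point their device references at
 * the loopback device when a netdevice is unregistered.
 */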
void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
static inline const void *choose_neigh_daddr(const struct in6_addr *p,
					     struct sk_buff *skb,
					     const void *daddr)
{
	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}

struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
{
	struct neighbour *n;

	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	if (n)
		return n;
	return neigh_create(&nd_tbl, daddr, dev);
}

static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
}

static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	dst_cow_metrics_generic,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};
static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
};
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol	= RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};

static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}

/* allocate dst with ip6_dst_ops */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);

static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from = rt->from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	rt->from = NULL;
	fib6_info_release(from);
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}

static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

static bool rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (rt->from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(rt->from);
	}
	return false;
}
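/* Multipath selection below uses the hash-threshold method (cf.
 * RFC 2992): fl6->mp_hash is compared against each sibling's
 * precomputed nh_upper_bound, so a given flow keeps mapping to the
 * same next hop as long as the set of siblings does not change.
 */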
static struct fib6_info *rt6_multipath_select(const struct net *net,
					      struct fib6_info *match,
					      struct flowi6 *fl6, int oif,
					      const struct sk_buff *skb,
					      int strict)
{
	struct fib6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}

/*
 *	Route lookup. rcu_read_lock() should be held.
 */

static inline struct fib6_info *rt6_device_match(struct net *net,
						 struct fib6_info *rt,
						 const struct in6_addr *saddr,
						 int oif,
						 int flags)
{
	struct fib6_info *sprt;

	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) {
		const struct net_device *dev = sprt->fib6_nh.nh_dev;

		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE)
		return net->ipv6.fib6_null_entry;

	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
}
#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}

static void rt6_probe(struct fib6_info *rt)
{
	struct __rt6_probe_work *work;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
		return;

	nh_gw = &rt->fib6_nh.nh_gw;
	dev = rt->fib6_nh.nh_dev;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		struct inet6_dev *idev;

		if (neigh->nud_state & NUD_VALID)
			goto out;

		idev = __in6_dev_get(dev);
		work = NULL;
		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct fib6_info *rt)
{
}
#endif
/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
static inline int rt6_check_dev(struct fib6_info *rt, int oif)
{
	const struct net_device *dev = rt->fib6_nh.nh_dev;

	if (!oif || dev->ifindex == oif)
		return 2;
	return 0;
}

static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	if (rt->fib6_flags & RTF_NONEXTHOP ||
	    !(rt->fib6_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
					  &rt->fib6_nh.nh_gw);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
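/* rt6_score_route() folds the checks above into a single score: a
 * matching output device contributes 2, the RFC 4191 route preference
 * is encoded in the bits above that, and negative rt6_nud_state
 * values are passed through as failures.
 */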
static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}
/* called with rcu_read_lock held */
static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
{
	const struct net_device *dev = fib6_info_nh_dev(f6i);
	bool rc = false;

	if (dev) {
		const struct inet6_dev *idev = __in6_dev_get(dev);

		rc = !!idev->cnf.ignore_routes_with_linkdown;
	}

	return rc;
}
static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
				    int *mpri, struct fib6_info *match,
				    bool *do_rr)
{
	int m;
	bool match_do_rr = false;

	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		goto out;

	if (fib6_ignore_linkdown(rt) &&
	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (fib6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}
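/* find_rr_leaf() scans the routes that share the current lowest
 * metric, starting at the round-robin head and wrapping around to the
 * node's leaf, so equally good routers are used in turn; the rotation
 * of fn->rr_ptr itself happens in rt6_select().
 */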
static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				      struct fib6_info *leaf,
				      struct fib6_info *rr_head,
				      u32 metric, int oif, int strict,
				      bool *do_rr)
{
	struct fib6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->rt6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
				    int oif, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->rt6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.fib6_null_entry;
}
static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
{
	return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
}

#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
#endif
/*
 *	Misc support functions
 */

/* called with rcu_lock held */
static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
{
	struct net_device *dev = rt->fib6_nh.nh_dev;

	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}

static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};

static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}
static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
{
	unsigned short flags = 0;

	if (rt->dst_nocount)
		flags |= DST_NOCOUNT;
	if (rt->dst_nopolicy)
		flags |= DST_NOPOLICY;
	if (rt->dst_host)
		flags |= DST_HOST;

	return flags;
}

static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);

	switch (ort->fib6_type) {
	case RTN_BLACKHOLE:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}
static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.flags |= fib6_info_dst_flags(ort);

	if (ort->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, ort);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	if (ort->fib6_type == RTN_LOCAL) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (ort->fib6_nh.nh_lwtstate) {
		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}

static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	fib6_info_hold(from);
	rt->from = from;
	dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
	if (from->fib6_metrics != &dst_default_metrics) {
		rt->dst._metrics |= DST_METRICS_REFCOUNTED;
		refcount_inc(&from->fib6_metrics->refcnt);
	}
}

static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
{
	struct net_device *dev = fib6_info_nh_dev(ort);

	ip6_rt_init_dst(rt, ort);

	rt->rt6i_dst = ort->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
	rt->rt6i_flags = ort->fib6_flags;
	rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->fib6_src;
#endif
	rt->rt6i_prefsrc = ort->fib6_prefsrc;
	rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
}
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}

static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
			  bool null_fallback)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (null_fallback) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}

/* called with rcu_lock held */
static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rt6_info *nrt;

	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (nrt)
		ip6_rt_copy_init(nrt, rt);

	return nrt;
}
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_info *f6i;
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	f6i = rcu_dereference(fn->leaf);
	if (!f6i) {
		f6i = net->ipv6.fib6_null_entry;
	} else {
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
				      fl6->flowi6_oif, flags);
		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
			f6i = rt6_multipath_select(net, f6i, fl6,
						   fl6->flowi6_oif, skb, flags);
	}
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);
	} else if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = ip6_create_rt_rcu(f6i);
		if (!rt) {
			rt = net->ipv6.ip6_null_entry;
			dst_hold(&rt->dst);
		}
	}

	rcu_read_unlock();

	trace_fib6_table_lookup(net, rt, table, fl6);

	return rt;
}
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);

struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);
/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason,
 * the route is released.
 * Caller must hold dst before calling it.
 */

static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}
static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(ort);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	rcu_read_unlock();
	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}

static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt)
		return NULL;
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt, false);

	return pcpu_rt;
}
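/* Publish a new percpu clone for this route.  The caller holds
 * rcu_read_lock() and has bottom halves disabled (see ip6_pol_route()),
 * so nothing else can write this cpu's slot concurrently; the
 * cmpxchg() plus BUG_ON(prev) below document that assumption.
 */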
static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	return pcpu_rt;
}
/* exception hash table implementation
 */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
	net->ipv6.rt6_stats->fib_rt_cache--;
}

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}
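/* Hash the (dst, src) key of an exception route into one of the
 * FIB6_EXCEPTION_BUCKET_SIZE chains.  The source address only
 * contributes on CONFIG_IPV6_SUBTREES kernels, matching how the
 * lookup helpers below select src_key.
 */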
static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

static unsigned int fib6_mtu(const struct fib6_info *rt)
{
	unsigned int mtu;

	if (rt->fib6_pmtu) {
		mtu = rt->fib6_pmtu;
	} else {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
}
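/* Insert nrt into ort's exception table, displacing any existing
 * entry with the same (daddr, saddr) key.  The insert is refused if
 * the table is being flushed or if nrt's MTU would not actually be
 * lower than ort's current MTU.
 */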
static int rt6_insert_exception(struct rt6_info *nrt,
				struct fib6_info *ort)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->fib6_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->fib6_table->tb6_lock);
		fib6_update_sernum(net, ort);
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}
void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() from recreating the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->fib6_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}

/* Remove the passed in cached rt from the hash table that contains it */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct fib6_info *from = rt->from;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err;

	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}
/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct fib6_info *from = rt->from;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;

	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return;

	rcu_read_lock();
	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

	rcu_read_unlock();
}

static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
				rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
			}
			bucket++;
		}
	}
}
static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
	 *
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * handle this.
	 */

	if (dst_mtu(&rt->dst) >= mtu)
		return true;

	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
		return true;

	return false;
}

static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}
#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non-gateway exceptions
	 * even if others still have references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}
void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_node *fn, *saved_fn;
	struct fib6_info *f6i;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	f6i = rt6_select(net, fn, oif, strict);
	if (f6i->fib6_nsiblings)
		f6i = rt6_multipath_select(net, f6i, fl6, oif, skb, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		trace_fib6_table_lookup(net, rt, table, fl6);
		return rt;
	}

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		trace_fib6_table_lookup(net, rt, table, fl6);
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(f6i->fib6_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		fib6_info_hold(f6i);
		rcu_read_unlock();

		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);
		fib6_info_release(f6i);

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		trace_fib6_table_lookup(net, uncached_rt, table, fl6);
		return uncached_rt;

	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(f6i);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, f6i);

		local_bh_enable();
		rcu_read_unlock();
		trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
static struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}

struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6,
					 const struct sk_buff *skb,
					 int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = icmp6_hdr(skb);
	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowinfo(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}
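/* Hash policy 0 (the default) hashes L3 fields only, falling back to
 * the inner packet of an ICMPv6 error so that errors follow the flow
 * that triggered them; policy 1 hashes the full 5-tuple.  The policy
 * comes from the per-netns fib_multipath_hash_policy sysctl.
 */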
/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}
static struct rt6_info *ip6_pol_route_output(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
}

struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	if (rt6_need_strict(&fl6->daddr)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new = &rt->dst;
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}
/*
 *	Destination cache support functions
 */

static bool fib6_check(struct fib6_info *f6i, u32 cookie)
{
	u32 rt_cookie = 0;

	if ((f6i && !fib6_get_cookie_safe(f6i, &rt_cookie)) ||
	     rt_cookie != cookie)
		return false;

	if (fib6_check_expired(f6i))
		return false;

	return true;
}

static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
{
	u32 rt_cookie = 0;

	if ((rt->from && !fib6_get_cookie_safe(rt->from, &rt_cookie)) ||
	    rt_cookie != cookie)
		return NULL;

	if (rt6_check_expired(rt))
		return NULL;

	return &rt->dst;
}

static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
{
	if (!__rt6_check_expired(rt) &&
	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
	    fib6_check(rt->from, cookie))
		return &rt->dst;
	else
		return NULL;
}

static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rt6_info *rt;

	rt = (struct rt6_info *) dst;

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	if (rt->rt6i_flags & RTF_PCPU ||
	    (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from))
		return rt6_dst_from_check(rt, cookie);
	else
		return rt6_check(rt, cookie);
}
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			if (rt6_check_expired(rt)) {
				rt6_remove_exception_rt(rt);
				dst = NULL;
			}
		} else {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}

static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			if (dst_hold_safe(&rt->dst))
				rt6_remove_exception_rt(rt);
		} else if (rt->from) {
			struct fib6_node *fn;

			rcu_read_lock();
			fn = rcu_dereference(rt->from->fib6_node);
			if (fn && (rt->rt6i_flags & RTF_DEFAULT))
				fn->fn_sernum = -1;
			rcu_read_unlock();
		}
	}
}
static void rt6_update_expires(struct rt6_info *rt0, int timeout)
{
	if (!(rt0->rt6i_flags & RTF_EXPIRES) && rt0->from)
		rt0->dst.expires = rt0->from->expires;

	dst_set_expires(&rt0->dst, timeout);
	rt0->rt6i_flags |= RTF_EXPIRES;
}

static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
	struct net *net = dev_net(rt->dst.dev);

	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
	rt->rt6i_flags |= RTF_MODIFIED;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}

static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
{
	return !(rt->rt6i_flags & RTF_CACHE) &&
		(rt->rt6i_flags & RTF_PCPU || rt->from);
}
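/* Entry point for PMTU updates: existing RTF_CACHE clones are updated
 * in place (and their exception stamp refreshed), while for percpu
 * copies and tree routes a new RTF_CACHE clone carrying the reduced
 * MTU is inserted into the exception table.
 */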
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	if (rt6->rt6i_flags & RTF_LOCAL)
		return;

	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}
	dst_confirm_neigh(dst, daddr);
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
	} else if (daddr) {
		struct rt6_info *nrt6;

		nrt6 = ip6_rt_cache_alloc(rt6->from, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);
			if (rt6_insert_exception(nrt6, rt6->from))
				dst_release_immediate(&nrt6->dst);
		}
	}
}
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu)
{
	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
}

void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
		     int oif, u32 mark, kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
	fl6.flowlabel = ip6_flowinfo(iph);
	fl6.flowi6_uid = uid;

	dst = ip6_route_output(net, NULL, &fl6);
	if (!dst->error)
		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);

void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	struct dst_entry *dst;

	ip6_update_pmtu(skb, sock_net(sk), mtu,
			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);

	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
			   const struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_SUBTREES
	struct ipv6_pinfo *np = inet6_sk(sk);
#endif

	ip6_dst_store(sk, dst,
		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
		      &sk->sk_v6_daddr : NULL,
#ifdef CONFIG_IPV6_SUBTREES
		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
		      &np->saddr :
#endif
		      NULL);
}
/* Handle redirects */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;
};

static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *ret = NULL, *rt_cache;
	struct fib6_info *rt;
	struct fib6_node *fn;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	rcu_read_lock();
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;
		if (fib6_check_expired(rt))
			continue;
		if (rt->fib6_flags & RTF_REJECT)
			break;
		if (!(rt->fib6_flags & RTF_GATEWAY))
			continue;
		if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
			continue;
		/* rt_cache's gateway might be different from its 'parent'
		 * in the case of an ip redirect.
		 * So we keep searching in the exception table if the gateway
		 * is different.
		 */
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
			rt_cache = rt6_find_cached_rt(rt,
						      &fl6->daddr,
						      &fl6->saddr);
			if (rt_cache &&
			    ipv6_addr_equal(&rdfl->gateway,
					    &rt_cache->rt6i_gateway)) {
				ret = rt_cache;
				break;
			}
			continue;
		}
		break;
	}

	if (!rt)
		rt = net->ipv6.fib6_null_entry;
	else if (rt->fib6_flags & RTF_REJECT) {
		ret = net->ipv6.ip6_null_entry;
		goto out;
	}

	if (rt == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

out:
	if (ret)
		dst_hold(&ret->dst);
	else
		ret = ip6_create_rt_rcu(rt);

	rcu_read_unlock();

	trace_fib6_table_lookup(net, ret, table, fl6);
	return ret;
}
static struct dst_entry *ip6_route_redirect(struct net *net,
					    const struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    const struct in6_addr *gateway)
{
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip6rd_flowi rdfl;

	rdfl.fl6 = *fl6;
	rdfl.gateway = *gateway;

	return fib6_rule_lookup(net, &rdfl.fl6, skb,
				flags, __ip6_route_redirect);
}

void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
		  kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_iif = LOOPBACK_IFINDEX;
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark;
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
	fl6.flowlabel = ip6_flowinfo(iph);
	fl6.flowi6_uid = uid;

	dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_redirect);

void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
			    u32 mark)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_iif = LOOPBACK_IFINDEX;
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark;
	fl6.daddr = msg->dest;
	fl6.saddr = iph->daddr;
	fl6.flowi6_uid = sock_net_uid(net, NULL);

	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}

void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
		     sk->sk_uid);
}
EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2507 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2509 struct net_device *dev = dst->dev;
2510 unsigned int mtu = dst_mtu(dst);
2511 struct net *net = dev_net(dev);
2513 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2515 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2516 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2519 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2520 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2521 * IPV6_MAXPLEN is also valid and means: "any MSS,
2522 * rely only on pmtu discovery"
2524 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2529 static unsigned int ip6_mtu(const struct dst_entry *dst)
2531 struct inet6_dev *idev;
2534 mtu = dst_metric_raw(dst, RTAX_MTU);
2541 idev = __in6_dev_get(dst->dev);
2543 mtu = idev->cnf.mtu6;
2547 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2549 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
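/* Allocate a standalone host route for ICMPv6 output. The entry is
 * never inserted into the FIB; it is kept on the uncached list so
 * that rt6_disable_ip() can release the net_device on teardown.
 */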
2552 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2555 struct dst_entry *dst;
2556 struct rt6_info *rt;
2557 struct inet6_dev *idev = in6_dev_get(dev);
2558 struct net *net = dev_net(dev);
2560 if (unlikely(!idev))
2561 return ERR_PTR(-ENODEV);
2563 rt = ip6_dst_alloc(net, dev, 0);
2564 if (unlikely(!rt)) {
2566 dst = ERR_PTR(-ENOMEM);
2570 rt->dst.flags |= DST_HOST;
2571 rt->dst.input = ip6_input;
2572 rt->dst.output = ip6_output;
2573 rt->rt6i_gateway = fl6->daddr;
2574 rt->rt6i_dst.addr = fl6->daddr;
2575 rt->rt6i_dst.plen = 128;
2576 rt->rt6i_idev = idev;
2577 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2579 /* Add this dst into uncached_list so that rt6_disable_ip() can
2580 * properly release the net_device.
2582 rt6_uncached_list_add(rt);
2583 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2585 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
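/* dst garbage collection: do nothing while the entry count is within
 * ip6_rt_max_size and the minimum GC interval has not elapsed.
 * Otherwise run fib6_run_gc() with an expiry window that grows by one
 * per pass, is reset to half of ip6_rt_gc_timeout once the count
 * drops below gc_thresh, and decays by ip6_rt_gc_elasticity.
 */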
2591 static int ip6_dst_gc(struct dst_ops *ops)
2593 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2594 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2595 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2596 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2597 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2598 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2601 entries = dst_entries_get_fast(ops);
2602 if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2603 entries <= rt_max_size)
2606 net->ipv6.ip6_rt_gc_expire++;
2607 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2608 entries = dst_entries_get_slow(ops);
2609 if (entries < ops->gc_thresh)
2610 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2612 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2613 return entries > rt_max_size;
2616 static int ip6_convert_metrics(struct net *net, struct fib6_info *rt,
2617 struct fib6_config *cfg)
2619 struct dst_metrics *p;
2624 p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL);
2628 refcount_set(&p->refcnt, 1);
2629 rt->fib6_metrics = p;
2631 return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics);
2634 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2635 struct fib6_config *cfg,
2636 const struct in6_addr *gw_addr,
2637 u32 tbid, int flags)
2639 struct flowi6 fl6 = {
2640 .flowi6_oif = cfg->fc_ifindex,
2642 .saddr = cfg->fc_prefsrc,
2644 struct fib6_table *table;
2645 struct rt6_info *rt;
2647 table = fib6_get_table(net, tbid);
2651 if (!ipv6_addr_any(&cfg->fc_prefsrc))
2652 flags |= RT6_LOOKUP_F_HAS_SADDR;
2654 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2655 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2657 /* if table lookup failed, fall back to full lookup */
2658 if (rt == net->ipv6.ip6_null_entry) {
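/* Validate an RTNH_F_ONLINK nexthop: the gateway must resolve, in the
 * egress device's table, to a route that is neither local, anycast
 * nor reject and that actually points out of the nominated device.
 */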
2666 static int ip6_route_check_nh_onlink(struct net *net,
2667 struct fib6_config *cfg,
2668 const struct net_device *dev,
2669 struct netlink_ext_ack *extack)
2671 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2672 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2673 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2674 struct rt6_info *grt;
2678 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2680 if (!grt->dst.error &&
2681 (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2682 NL_SET_ERR_MSG(extack,
2683 "Nexthop has invalid gateway or device mismatch");
2693 static int ip6_route_check_nh(struct net *net,
2694 struct fib6_config *cfg,
2695 struct net_device **_dev,
2696 struct inet6_dev **idev)
2698 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2699 struct net_device *dev = _dev ? *_dev : NULL;
2700 struct rt6_info *grt = NULL;
2701 int err = -EHOSTUNREACH;
2703 if (cfg->fc_table) {
2704 int flags = RT6_LOOKUP_F_IFACE;
2706 grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2707 cfg->fc_table, flags);
2709 if (grt->rt6i_flags & RTF_GATEWAY ||
2710 (dev && dev != grt->dst.dev)) {
2718 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2724 if (dev != grt->dst.dev) {
2729 *_dev = dev = grt->dst.dev;
2730 *idev = grt->rt6i_idev;
2732 in6_dev_hold(grt->rt6i_idev);
2735 if (!(grt->rt6i_flags & RTF_GATEWAY))
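/* Validate a userspace-supplied gateway: it must not be a local
 * address, should normally be link-local (non-link-local unicast and
 * IPv4-mapped addresses are tolerated as exceptions), and must
 * resolve to an egress device via the onlink or recursive lookup.
 */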
2744 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2745 struct net_device **_dev, struct inet6_dev **idev,
2746 struct netlink_ext_ack *extack)
2748 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2749 int gwa_type = ipv6_addr_type(gw_addr);
2750 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2751 const struct net_device *dev = *_dev;
2752 bool need_addr_check = !dev;
2755 /* if gw_addr is local we will fail to detect this in case the
2756 * address is still TENTATIVE (DAD in progress). rt6_lookup()
2757 * will return the already-added prefix route via the interface
2758 * that the prefix route was assigned to, which might be non-loopback.
2761 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2762 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2766 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2767 /* IPv6 strictly inhibits using non-link-local
2768 * addresses as the nexthop address.
2769 * Otherwise, the router will not be able to send redirects.
2770 * It is very good, but in some (rare!) circumstances
2771 * (SIT, PtP, NBMA NOARP links) it is handy to allow
2772 * some exceptions. --ANK
2773 * We allow IPv4-mapped nexthops to support RFC4798-type addressing.
2776 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2777 NL_SET_ERR_MSG(extack, "Invalid gateway address");
2781 if (cfg->fc_flags & RTNH_F_ONLINK)
2782 err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2784 err = ip6_route_check_nh(net, cfg, _dev, idev);
2790 /* reload in case device was changed */
2795 NL_SET_ERR_MSG(extack, "Egress device not specified");
2797 } else if (dev->flags & IFF_LOOPBACK) {
2798 NL_SET_ERR_MSG(extack,
2799 "Egress device can not be loopback device for this route");
2803 /* if we did not check gw_addr above, do so now that the
2804 * egress device has been resolved.
2806 if (need_addr_check &&
2807 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2808 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
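/* Translate a fib6_config (from netlink or ioctl) into a fully
 * initialised fib6_info: validate flags, prefix lengths and the
 * gateway, resolve the device and table, and convert metrics.
 * Returns an ERR_PTR on failure.
 */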
2817 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
2819 struct netlink_ext_ack *extack)
2821 struct net *net = cfg->fc_nlinfo.nl_net;
2822 struct fib6_info *rt = NULL;
2823 struct net_device *dev = NULL;
2824 struct inet6_dev *idev = NULL;
2825 struct fib6_table *table;
2829 /* RTF_PCPU is an internal flag; can not be set by userspace */
2830 if (cfg->fc_flags & RTF_PCPU) {
2831 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2835 /* RTF_CACHE is an internal flag; can not be set by userspace */
2836 if (cfg->fc_flags & RTF_CACHE) {
2837 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2841 if (cfg->fc_type > RTN_MAX) {
2842 NL_SET_ERR_MSG(extack, "Invalid route type");
2846 if (cfg->fc_dst_len > 128) {
2847 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2850 if (cfg->fc_src_len > 128) {
2851 NL_SET_ERR_MSG(extack, "Invalid source address length");
2854 #ifndef CONFIG_IPV6_SUBTREES
2855 if (cfg->fc_src_len) {
2856 NL_SET_ERR_MSG(extack,
2857 "Specifying source address requires IPV6_SUBTREES to be enabled");
2861 if (cfg->fc_ifindex) {
2863 dev = dev_get_by_index(net, cfg->fc_ifindex);
2866 idev = in6_dev_get(dev);
2871 if (cfg->fc_metric == 0)
2872 cfg->fc_metric = IP6_RT_PRIO_USER;
2874 if (cfg->fc_flags & RTNH_F_ONLINK) {
2876 NL_SET_ERR_MSG(extack,
2877 "Nexthop device required for onlink");
2882 if (!(dev->flags & IFF_UP)) {
2883 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2890 if (cfg->fc_nlinfo.nlh &&
2891 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2892 table = fib6_get_table(net, cfg->fc_table);
2894 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2895 table = fib6_new_table(net, cfg->fc_table);
2898 table = fib6_new_table(net, cfg->fc_table);
2905 rt = fib6_info_alloc(gfp_flags);
2909 if (cfg->fc_flags & RTF_ADDRCONF)
2910 rt->dst_nocount = true;
2912 err = ip6_convert_metrics(net, rt, cfg);
2916 if (cfg->fc_flags & RTF_EXPIRES)
2917 fib6_set_expires(rt, jiffies +
2918 clock_t_to_jiffies(cfg->fc_expires));
2920 fib6_clean_expires(rt);
2922 if (cfg->fc_protocol == RTPROT_UNSPEC)
2923 cfg->fc_protocol = RTPROT_BOOT;
2924 rt->fib6_protocol = cfg->fc_protocol;
2926 addr_type = ipv6_addr_type(&cfg->fc_dst);
2928 if (cfg->fc_encap) {
2929 struct lwtunnel_state *lwtstate;
2931 err = lwtunnel_build_state(cfg->fc_encap_type,
2932 cfg->fc_encap, AF_INET6, cfg,
2936 rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
2939 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
2940 rt->fib6_dst.plen = cfg->fc_dst_len;
2941 if (rt->fib6_dst.plen == 128)
2942 rt->dst_host = true;
2944 #ifdef CONFIG_IPV6_SUBTREES
2945 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
2946 rt->fib6_src.plen = cfg->fc_src_len;
2949 rt->fib6_metric = cfg->fc_metric;
2950 rt->fib6_nh.nh_weight = 1;
2952 rt->fib6_type = cfg->fc_type;
2954 /* We cannot add true routes via loopback here;
2955 they would result in kernel looping. Promote them to reject routes.
2957 if ((cfg->fc_flags & RTF_REJECT) ||
2958 (dev && (dev->flags & IFF_LOOPBACK) &&
2959 !(addr_type & IPV6_ADDR_LOOPBACK) &&
2960 !(cfg->fc_flags & RTF_LOCAL))) {
2961 /* hold loopback dev/idev if we haven't done so. */
2962 if (dev != net->loopback_dev) {
2967 dev = net->loopback_dev;
2969 idev = in6_dev_get(dev);
2975 rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
2979 if (cfg->fc_flags & RTF_GATEWAY) {
2980 err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
2984 rt->fib6_nh.nh_gw = cfg->fc_gateway;
2991 if (idev->cnf.disable_ipv6) {
2992 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
2997 if (!(dev->flags & IFF_UP)) {
2998 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3003 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3004 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3005 NL_SET_ERR_MSG(extack, "Invalid source address");
3009 rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3010 rt->fib6_prefsrc.plen = 128;
3012 rt->fib6_prefsrc.plen = 0;
3014 rt->fib6_flags = cfg->fc_flags;
3017 if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3018 !netif_carrier_ok(dev))
3019 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
3020 rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
3021 rt->fib6_nh.nh_dev = dev;
3022 rt->fib6_table = table;
3024 cfg->fc_nlinfo.nl_net = dev_net(dev);
3036 fib6_info_release(rt);
3037 return ERR_PTR(err);
3040 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3041 struct netlink_ext_ack *extack)
3043 struct fib6_info *rt;
3046 rt = ip6_route_info_create(cfg, gfp_flags, extack);
3050 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3051 fib6_info_release(rt);
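/* Core deletion: unlink the entry from its table under tb6_lock and
 * drop the reference the table held on the fib6_info.
 */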
3056 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3058 struct net *net = info->nl_net;
3059 struct fib6_table *table;
3062 if (rt == net->ipv6.fib6_null_entry) {
3067 table = rt->fib6_table;
3068 spin_lock_bh(&table->tb6_lock);
3069 err = fib6_del(rt, info);
3070 spin_unlock_bh(&table->tb6_lock);
3073 fib6_info_release(rt);
3077 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3079 struct nl_info info = { .nl_net = net };
3081 return __ip6_del_rt(rt, &info);
3084 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3086 struct nl_info *info = &cfg->fc_nlinfo;
3087 struct net *net = info->nl_net;
3088 struct sk_buff *skb = NULL;
3089 struct fib6_table *table;
3092 if (rt == net->ipv6.fib6_null_entry)
3094 table = rt->fib6_table;
3095 spin_lock_bh(&table->tb6_lock);
3097 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3098 struct fib6_info *sibling, *next_sibling;
3100 /* prefer to send a single notification with all hops */
3101 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3103 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3105 if (rt6_fill_node(net, skb, rt, NULL,
3106 NULL, NULL, 0, RTM_DELROUTE,
3107 info->portid, seq, 0) < 0) {
3111 info->skip_notify = 1;
3114 list_for_each_entry_safe(sibling, next_sibling,
3117 err = fib6_del(sibling, info);
3123 err = fib6_del(rt, info);
3125 spin_unlock_bh(&table->tb6_lock);
3127 fib6_info_release(rt);
3130 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3131 info->nlh, gfp_any());
3136 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3140 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3143 if (cfg->fc_flags & RTF_GATEWAY &&
3144 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3146 if (dst_hold_safe(&rt->dst))
3147 rc = rt6_remove_exception_rt(rt);
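/* Delete the route described by cfg. With RTF_CACHE the target is a
 * cached (exception) entry rather than the FIB entry itself.
 */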
3152 static int ip6_route_del(struct fib6_config *cfg,
3153 struct netlink_ext_ack *extack)
3155 struct rt6_info *rt_cache;
3156 struct fib6_table *table;
3157 struct fib6_info *rt;
3158 struct fib6_node *fn;
3161 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3163 NL_SET_ERR_MSG(extack, "FIB table does not exist");
3169 fn = fib6_locate(&table->tb6_root,
3170 &cfg->fc_dst, cfg->fc_dst_len,
3171 &cfg->fc_src, cfg->fc_src_len,
3172 !(cfg->fc_flags & RTF_CACHE));
3175 for_each_fib6_node_rt_rcu(fn) {
3176 if (cfg->fc_flags & RTF_CACHE) {
3179 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3182 rc = ip6_del_cached_rt(rt_cache, cfg);
3188 if (cfg->fc_ifindex &&
3189 (!rt->fib6_nh.nh_dev ||
3190 rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
3192 if (cfg->fc_flags & RTF_GATEWAY &&
3193 !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
3195 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3197 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3202 /* if gateway was specified only delete the one hop */
3203 if (cfg->fc_flags & RTF_GATEWAY)
3204 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3206 return __ip6_del_rt_siblings(rt, cfg);
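/* Handle an ICMPv6 Redirect per RFC 4861: validate the message and
 * its ND options, confirm the sending router, update the neighbour
 * cache, and install the new nexthop as a cached-route exception on
 * the originating FIB entry.
 */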
3214 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3216 struct netevent_redirect netevent;
3217 struct rt6_info *rt, *nrt = NULL;
3218 struct ndisc_options ndopts;
3219 struct inet6_dev *in6_dev;
3220 struct neighbour *neigh;
3222 int optlen, on_link;
3225 optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3226 optlen -= sizeof(*msg);
3229 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3233 msg = (struct rd_msg *)icmp6_hdr(skb);
3235 if (ipv6_addr_is_multicast(&msg->dest)) {
3236 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3241 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3243 } else if (ipv6_addr_type(&msg->target) !=
3244 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3245 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3249 in6_dev = __in6_dev_get(skb->dev);
3252 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3256 * The IP source address of the Redirect MUST be the same as the current
3257 * first-hop router for the specified ICMP Destination Address.
3260 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3261 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3266 if (ndopts.nd_opts_tgt_lladdr) {
3267 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3270 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3275 rt = (struct rt6_info *) dst;
3276 if (rt->rt6i_flags & RTF_REJECT) {
3277 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3281 /* Redirect received -> path was valid.
3282 * Look, redirects are sent only in response to data packets,
3283 * so that this nexthop apparently is reachable. --ANK
3285 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3287 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3292 * We have finally decided to accept it.
3295 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3296 NEIGH_UPDATE_F_WEAK_OVERRIDE|
3297 NEIGH_UPDATE_F_OVERRIDE|
3298 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3299 NEIGH_UPDATE_F_ISROUTER)),
3300 NDISC_REDIRECT, &ndopts);
3302 nrt = ip6_rt_cache_alloc(rt->from, &msg->dest, NULL);
3306 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3308 nrt->rt6i_flags &= ~RTF_GATEWAY;
3310 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3312 /* No need to remove rt from the exception table if rt is
3313 * a cached route because rt6_insert_exception() will take care of it.
3316 if (rt6_insert_exception(nrt, rt->from)) {
3317 dst_release_immediate(&nrt->dst);
3321 netevent.old = &rt->dst;
3322 netevent.new = &nrt->dst;
3323 netevent.daddr = &msg->dest;
3324 netevent.neigh = neigh;
3325 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3328 neigh_release(neigh);
3331 #ifdef CONFIG_IPV6_ROUTE_INFO
3332 static struct fib6_info *rt6_get_route_info(struct net *net,
3333 const struct in6_addr *prefix, int prefixlen,
3334 const struct in6_addr *gwaddr,
3335 struct net_device *dev)
3337 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3338 int ifindex = dev->ifindex;
3339 struct fib6_node *fn;
3340 struct fib6_info *rt = NULL;
3341 struct fib6_table *table;
3343 table = fib6_get_table(net, tb_id);
3348 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3352 for_each_fib6_node_rt_rcu(fn) {
3353 if (rt->fib6_nh.nh_dev->ifindex != ifindex)
3355 if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3357 if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
3367 static struct fib6_info *rt6_add_route_info(struct net *net,
3368 const struct in6_addr *prefix, int prefixlen,
3369 const struct in6_addr *gwaddr,
3370 struct net_device *dev,
3373 struct fib6_config cfg = {
3374 .fc_metric = IP6_RT_PRIO_USER,
3375 .fc_ifindex = dev->ifindex,
3376 .fc_dst_len = prefixlen,
3377 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3378 RTF_UP | RTF_PREF(pref),
3379 .fc_protocol = RTPROT_RA,
3380 .fc_type = RTN_UNICAST,
3381 .fc_nlinfo.portid = 0,
3382 .fc_nlinfo.nlh = NULL,
3383 .fc_nlinfo.nl_net = net,
3386 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3387 cfg.fc_dst = *prefix;
3388 cfg.fc_gateway = *gwaddr;
3390 /* We should treat it as a default route if prefix length is 0. */
3392 cfg.fc_flags |= RTF_DEFAULT;
3394 ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3396 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3400 struct fib6_info *rt6_get_dflt_router(struct net *net,
3401 const struct in6_addr *addr,
3402 struct net_device *dev)
3404 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3405 struct fib6_info *rt;
3406 struct fib6_table *table;
3408 table = fib6_get_table(net, tb_id);
3413 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3414 if (dev == rt->fib6_nh.nh_dev &&
3415 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3416 ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
3425 struct fib6_info *rt6_add_dflt_router(struct net *net,
3426 const struct in6_addr *gwaddr,
3427 struct net_device *dev,
3430 struct fib6_config cfg = {
3431 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3432 .fc_metric = IP6_RT_PRIO_USER,
3433 .fc_ifindex = dev->ifindex,
3434 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3435 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3436 .fc_protocol = RTPROT_RA,
3437 .fc_type = RTN_UNICAST,
3438 .fc_nlinfo.portid = 0,
3439 .fc_nlinfo.nlh = NULL,
3440 .fc_nlinfo.nl_net = net,
3443 cfg.fc_gateway = *gwaddr;
3445 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3446 struct fib6_table *table;
3448 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3450 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3453 return rt6_get_dflt_router(net, gwaddr, dev);
3456 static void __rt6_purge_dflt_routers(struct net *net,
3457 struct fib6_table *table)
3459 struct fib6_info *rt;
3463 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3464 struct net_device *dev = fib6_info_nh_dev(rt);
3465 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3467 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3468 (!idev || idev->cnf.accept_ra != 2)) {
3471 ip6_del_rt(net, rt);
3477 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3480 void rt6_purge_dflt_routers(struct net *net)
3482 struct fib6_table *table;
3483 struct hlist_head *head;
3488 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3489 head = &net->ipv6.fib_table_hash[h];
3490 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3491 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3492 __rt6_purge_dflt_routers(net, table);
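/* Convert the legacy in6_rtmsg used by the SIOCADDRT/SIOCDELRT ioctls
 * into a fib6_config understood by ip6_route_add()/ip6_route_del().
 */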
3499 static void rtmsg_to_fib6_config(struct net *net,
3500 struct in6_rtmsg *rtmsg,
3501 struct fib6_config *cfg)
3503 memset(cfg, 0, sizeof(*cfg));
3505 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3507 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3508 cfg->fc_metric = rtmsg->rtmsg_metric;
3509 cfg->fc_expires = rtmsg->rtmsg_info;
3510 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3511 cfg->fc_src_len = rtmsg->rtmsg_src_len;
3512 cfg->fc_flags = rtmsg->rtmsg_flags;
3513 cfg->fc_type = rtmsg->rtmsg_type;
3515 cfg->fc_nlinfo.nl_net = net;
3517 cfg->fc_dst = rtmsg->rtmsg_dst;
3518 cfg->fc_src = rtmsg->rtmsg_src;
3519 cfg->fc_gateway = rtmsg->rtmsg_gateway;
3522 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3524 struct fib6_config cfg;
3525 struct in6_rtmsg rtmsg;
3529 case SIOCADDRT: /* Add a route */
3530 case SIOCDELRT: /* Delete a route */
3531 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3533 err = copy_from_user(&rtmsg, arg,
3534 sizeof(struct in6_rtmsg));
3538 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3543 err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3546 err = ip6_route_del(&cfg, NULL);
3560 * Drop the packet on the floor
3563 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3566 struct dst_entry *dst = skb_dst(skb);
3567 switch (ipstats_mib_noroutes) {
3568 case IPSTATS_MIB_INNOROUTES:
3569 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3570 if (type == IPV6_ADDR_ANY) {
3571 IP6_INC_STATS(dev_net(dst->dev),
3572 __in6_dev_get_safely(skb->dev),
3573 IPSTATS_MIB_INADDRERRORS);
3577 case IPSTATS_MIB_OUTNOROUTES:
3578 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3579 ipstats_mib_noroutes);
3582 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3587 static int ip6_pkt_discard(struct sk_buff *skb)
3589 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3592 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3594 skb->dev = skb_dst(skb)->dev;
3595 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3598 static int ip6_pkt_prohibit(struct sk_buff *skb)
3600 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3603 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3605 skb->dev = skb_dst(skb)->dev;
3606 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3610 * Allocate a dst for local (unicast / anycast) address.
3613 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3614 struct inet6_dev *idev,
3615 const struct in6_addr *addr,
3616 bool anycast, gfp_t gfp_flags)
3619 struct net_device *dev = idev->dev;
3620 struct fib6_info *f6i;
3622 f6i = fib6_info_alloc(gfp_flags);
3624 return ERR_PTR(-ENOMEM);
3626 f6i->dst_nocount = true;
3627 f6i->dst_host = true;
3628 f6i->fib6_protocol = RTPROT_KERNEL;
3629 f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
3631 f6i->fib6_type = RTN_ANYCAST;
3632 f6i->fib6_flags |= RTF_ANYCAST;
3634 f6i->fib6_type = RTN_LOCAL;
3635 f6i->fib6_flags |= RTF_LOCAL;
3638 f6i->fib6_nh.nh_gw = *addr;
3640 f6i->fib6_nh.nh_dev = dev;
3641 f6i->fib6_dst.addr = *addr;
3642 f6i->fib6_dst.plen = 128;
3643 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3644 f6i->fib6_table = fib6_get_table(net, tb_id);
3649 /* remove deleted ip from prefsrc entries */
3650 struct arg_dev_net_ip {
3651 struct net_device *dev;
3653 struct in6_addr *addr;
3656 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3658 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3659 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3660 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3662 if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3663 rt != net->ipv6.fib6_null_entry &&
3664 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3665 spin_lock_bh(&rt6_exception_lock);
3666 /* remove prefsrc entry */
3667 rt->fib6_prefsrc.plen = 0;
3668 /* need to update cache as well */
3669 rt6_exceptions_remove_prefsrc(rt);
3670 spin_unlock_bh(&rt6_exception_lock);
3675 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3677 struct net *net = dev_net(ifp->idev->dev);
3678 struct arg_dev_net_ip adni = {
3679 .dev = ifp->idev->dev,
3683 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3686 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3688 /* Remove routers and update dst entries when a gateway turns into a host. */
3689 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3691 struct in6_addr *gateway = (struct in6_addr *)arg;
3693 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3694 ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
3698 /* Further clean up cached routes in exception table.
3699 * This is needed because a cached route may have a different
3700 * gateway than its 'parent' in the case of an ip redirect.
3702 rt6_exceptions_clean_tohost(rt, gateway);
3707 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3709 fib6_clean_all(net, fib6_clean_tohost, gateway);
3712 struct arg_netdev_event {
3713 const struct net_device *dev;
3715 unsigned int nh_flags;
3716 unsigned long event;
3720 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3722 struct fib6_info *iter;
3723 struct fib6_node *fn;
3725 fn = rcu_dereference_protected(rt->fib6_node,
3726 lockdep_is_held(&rt->fib6_table->tb6_lock));
3727 iter = rcu_dereference_protected(fn->leaf,
3728 lockdep_is_held(&rt->fib6_table->tb6_lock));
3730 if (iter->fib6_metric == rt->fib6_metric &&
3731 rt6_qualify_for_ecmp(iter))
3733 iter = rcu_dereference_protected(iter->rt6_next,
3734 lockdep_is_held(&rt->fib6_table->tb6_lock));
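/* A nexthop is unusable if it is marked dead, or if it is link-down
 * and the route is configured to ignore link-down nexthops.
 */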
3740 static bool rt6_is_dead(const struct fib6_info *rt)
3742 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3743 (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3744 fib6_ignore_linkdown(rt)))
3750 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3752 struct fib6_info *iter;
3755 if (!rt6_is_dead(rt))
3756 total += rt->fib6_nh.nh_weight;
3758 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3759 if (!rt6_is_dead(iter))
3760 total += iter->fib6_nh.nh_weight;
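/* Hash-threshold multipath: each live nexthop is assigned the upper
 * bound of its region of the 31-bit hash space, proportional to its
 * cumulative weight. E.g. with weights 1 and 2, the bounds land at
 * roughly 1/3 and 3/3 of the space. Dead nexthops keep -1 so they
 * never match.
 */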
3766 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3768 int upper_bound = -1;
3770 if (!rt6_is_dead(rt)) {
3771 *weight += rt->fib6_nh.nh_weight;
3772 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3775 atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
3778 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3780 struct fib6_info *iter;
3783 rt6_upper_bound_set(rt, &weight, total);
3785 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3786 rt6_upper_bound_set(iter, &weight, total);
3789 void rt6_multipath_rebalance(struct fib6_info *rt)
3791 struct fib6_info *first;
3794 /* If the entire multipath route was marked for flushing,
3795 * there is no need to rebalance upon the removal of every sibling route.
3798 if (!rt->fib6_nsiblings || rt->should_flush)
3801 /* During lookup routes are evaluated in order, so we need to
3802 * make sure upper bounds are assigned from the first sibling onwards.
3805 first = rt6_multipath_first_sibling(rt);
3806 if (WARN_ON_ONCE(!first))
3809 total = rt6_multipath_total_weight(first);
3810 rt6_multipath_upper_bound_set(first, total);
3813 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3815 const struct arg_netdev_event *arg = p_arg;
3816 struct net *net = dev_net(arg->dev);
3818 if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
3819 rt->fib6_nh.nh_flags &= ~arg->nh_flags;
3820 fib6_update_sernum_upto_root(net, rt);
3821 rt6_multipath_rebalance(rt);
3827 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3829 struct arg_netdev_event arg = {
3832 .nh_flags = nh_flags,
3836 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3837 arg.nh_flags |= RTNH_F_LINKDOWN;
3839 fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3842 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3843 const struct net_device *dev)
3845 struct fib6_info *iter;
3847 if (rt->fib6_nh.nh_dev == dev)
3849 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3850 if (iter->fib6_nh.nh_dev == dev)
3856 static void rt6_multipath_flush(struct fib6_info *rt)
3858 struct fib6_info *iter;
3860 rt->should_flush = 1;
3861 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3862 iter->should_flush = 1;
3865 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
3866 const struct net_device *down_dev)
3868 struct fib6_info *iter;
3869 unsigned int dead = 0;
3871 if (rt->fib6_nh.nh_dev == down_dev ||
3872 rt->fib6_nh.nh_flags & RTNH_F_DEAD)
3874 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3875 if (iter->fib6_nh.nh_dev == down_dev ||
3876 iter->fib6_nh.nh_flags & RTNH_F_DEAD)
3882 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
3883 const struct net_device *dev,
3884 unsigned int nh_flags)
3886 struct fib6_info *iter;
3888 if (rt->fib6_nh.nh_dev == dev)
3889 rt->fib6_nh.nh_flags |= nh_flags;
3890 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3891 if (iter->fib6_nh.nh_dev == dev)
3892 iter->fib6_nh.nh_flags |= nh_flags;
3895 /* called with write lock held for table with rt */
3896 static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
3898 const struct arg_netdev_event *arg = p_arg;
3899 const struct net_device *dev = arg->dev;
3900 struct net *net = dev_net(dev);
3902 if (rt == net->ipv6.fib6_null_entry)
3905 switch (arg->event) {
3906 case NETDEV_UNREGISTER:
3907 return rt->fib6_nh.nh_dev == dev ? -1 : 0;
3909 if (rt->should_flush)
3911 if (!rt->fib6_nsiblings)
3912 return rt->fib6_nh.nh_dev == dev ? -1 : 0;
3913 if (rt6_multipath_uses_dev(rt, dev)) {
3916 count = rt6_multipath_dead_count(rt, dev);
3917 if (rt->fib6_nsiblings + 1 == count) {
3918 rt6_multipath_flush(rt);
3921 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
3923 fib6_update_sernum(net, rt);
3924 rt6_multipath_rebalance(rt);
3928 if (rt->fib6_nh.nh_dev != dev ||
3929 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
3931 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
3932 rt6_multipath_rebalance(rt);
3939 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
3941 struct arg_netdev_event arg = {
3948 fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
3951 void rt6_disable_ip(struct net_device *dev, unsigned long event)
3953 rt6_sync_down_dev(dev, event);
3954 rt6_uncached_list_flush_dev(dev_net(dev), dev);
3955 neigh_ifdown(&nd_tbl, dev);
3958 struct rt6_mtu_change_arg {
3959 struct net_device *dev;
3963 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
3965 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
3966 struct inet6_dev *idev;
3968 /* In IPv6 PMTU discovery is not optional,
3969 so an RTAX_MTU lock cannot disable it.
3970 We still use this lock to block changes
3971 caused by addrconf/ndisc.
3974 idev = __in6_dev_get(arg->dev);
3978 /* For an administrative MTU increase there is no way to discover
3979 an IPv6 PMTU increase, so the PMTU should be updated here.
3980 Since RFC 1981 provides no mechanism for administrative MTU
3981 increases, updating the PMTU here is a MUST (e.g. jumbo frames).
3983 if (rt->fib6_nh.nh_dev == arg->dev &&
3984 !fib6_metric_locked(rt, RTAX_MTU)) {
3985 u32 mtu = rt->fib6_pmtu;
3987 if (mtu >= arg->mtu ||
3988 (mtu < arg->mtu && mtu == idev->cnf.mtu6))
3989 fib6_metric_set(rt, RTAX_MTU, arg->mtu);
3991 spin_lock_bh(&rt6_exception_lock);
3992 rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
3993 spin_unlock_bh(&rt6_exception_lock);
3998 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4000 struct rt6_mtu_change_arg arg = {
4005 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4008 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4009 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
4010 [RTA_OIF] = { .type = NLA_U32 },
4011 [RTA_IIF] = { .type = NLA_U32 },
4012 [RTA_PRIORITY] = { .type = NLA_U32 },
4013 [RTA_METRICS] = { .type = NLA_NESTED },
4014 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
4015 [RTA_PREF] = { .type = NLA_U8 },
4016 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
4017 [RTA_ENCAP] = { .type = NLA_NESTED },
4018 [RTA_EXPIRES] = { .type = NLA_U32 },
4019 [RTA_UID] = { .type = NLA_U32 },
4020 [RTA_MARK] = { .type = NLA_U32 },
4023 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4024 struct fib6_config *cfg,
4025 struct netlink_ext_ack *extack)
4028 struct nlattr *tb[RTA_MAX+1];
4032 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4038 rtm = nlmsg_data(nlh);
4039 memset(cfg, 0, sizeof(*cfg));
4041 cfg->fc_table = rtm->rtm_table;
4042 cfg->fc_dst_len = rtm->rtm_dst_len;
4043 cfg->fc_src_len = rtm->rtm_src_len;
4044 cfg->fc_flags = RTF_UP;
4045 cfg->fc_protocol = rtm->rtm_protocol;
4046 cfg->fc_type = rtm->rtm_type;
4048 if (rtm->rtm_type == RTN_UNREACHABLE ||
4049 rtm->rtm_type == RTN_BLACKHOLE ||
4050 rtm->rtm_type == RTN_PROHIBIT ||
4051 rtm->rtm_type == RTN_THROW)
4052 cfg->fc_flags |= RTF_REJECT;
4054 if (rtm->rtm_type == RTN_LOCAL)
4055 cfg->fc_flags |= RTF_LOCAL;
4057 if (rtm->rtm_flags & RTM_F_CLONED)
4058 cfg->fc_flags |= RTF_CACHE;
4060 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4062 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
4063 cfg->fc_nlinfo.nlh = nlh;
4064 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
4066 if (tb[RTA_GATEWAY]) {
4067 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4068 cfg->fc_flags |= RTF_GATEWAY;
4072 int plen = (rtm->rtm_dst_len + 7) >> 3;
4074 if (nla_len(tb[RTA_DST]) < plen)
4077 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4081 int plen = (rtm->rtm_src_len + 7) >> 3;
4083 if (nla_len(tb[RTA_SRC]) < plen)
4086 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4089 if (tb[RTA_PREFSRC])
4090 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4093 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4095 if (tb[RTA_PRIORITY])
4096 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4098 if (tb[RTA_METRICS]) {
4099 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4100 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4104 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4106 if (tb[RTA_MULTIPATH]) {
4107 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4108 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4110 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4111 cfg->fc_mp_len, extack);
4117 pref = nla_get_u8(tb[RTA_PREF]);
4118 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4119 pref != ICMPV6_ROUTER_PREF_HIGH)
4120 pref = ICMPV6_ROUTER_PREF_MEDIUM;
4121 cfg->fc_flags |= RTF_PREF(pref);
4125 cfg->fc_encap = tb[RTA_ENCAP];
4127 if (tb[RTA_ENCAP_TYPE]) {
4128 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4130 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4135 if (tb[RTA_EXPIRES]) {
4136 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4138 if (addrconf_finite_timeout(timeout)) {
4139 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4140 cfg->fc_flags |= RTF_EXPIRES;
4150 struct fib6_info *fib6_info;
4151 struct fib6_config r_cfg;
4152 struct list_head next;
4155 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4159 list_for_each_entry(nh, rt6_nh_list, next) {
4160 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4161 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4162 nh->r_cfg.fc_ifindex);
4166 static int ip6_route_info_append(struct net *net,
4167 struct list_head *rt6_nh_list,
4168 struct fib6_info *rt,
4169 struct fib6_config *r_cfg)
4174 list_for_each_entry(nh, rt6_nh_list, next) {
4175 /* check if fib6_info already exists */
4176 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4180 nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4184 err = ip6_convert_metrics(net, rt, r_cfg);
4189 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4190 list_add_tail(&nh->next, rt6_nh_list);
4195 static void ip6_route_mpath_notify(struct fib6_info *rt,
4196 struct fib6_info *rt_last,
4197 struct nl_info *info,
4200 /* if this is an APPEND route, then rt points to the first route
4201 * inserted and rt_last points to last route inserted. Userspace
4202 * wants a consistent dump of the route which starts at the first
4203 * nexthop. Since sibling routes are always added at the end of
4204 * the list, find the first sibling of the last route appended
4206 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4207 rt = list_first_entry(&rt_last->fib6_siblings,
4213 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
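/* RTM_NEWROUTE with RTA_MULTIPATH: build one fib6_info per rtnexthop,
 * insert them as siblings, and send a single notification covering
 * all hops once the list is installed; on failure, delete the routes
 * that were already added.
 */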
4216 static int ip6_route_multipath_add(struct fib6_config *cfg,
4217 struct netlink_ext_ack *extack)
4219 struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4220 struct nl_info *info = &cfg->fc_nlinfo;
4221 struct fib6_config r_cfg;
4222 struct rtnexthop *rtnh;
4223 struct fib6_info *rt;
4224 struct rt6_nh *err_nh;
4225 struct rt6_nh *nh, *nh_safe;
4231 int replace = (cfg->fc_nlinfo.nlh &&
4232 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4233 LIST_HEAD(rt6_nh_list);
4235 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4236 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4237 nlflags |= NLM_F_APPEND;
4239 remaining = cfg->fc_mp_len;
4240 rtnh = (struct rtnexthop *)cfg->fc_mp;
4242 /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4243 * fib6_info structs per nexthop
4245 while (rtnh_ok(rtnh, remaining)) {
4246 memcpy(&r_cfg, cfg, sizeof(*cfg));
4247 if (rtnh->rtnh_ifindex)
4248 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4250 attrlen = rtnh_attrlen(rtnh);
4252 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4254 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4256 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4257 r_cfg.fc_flags |= RTF_GATEWAY;
4259 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4260 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4262 r_cfg.fc_encap_type = nla_get_u16(nla);
4265 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4266 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4273 rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
4275 err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4278 fib6_info_release(rt);
4282 rtnh = rtnh_next(rtnh, &remaining);
4285 /* for add and replace send one notification with all nexthops.
4286 * Skip the notification in fib6_add_rt2node and send one with
4287 * the full route when done
4289 info->skip_notify = 1;
4292 list_for_each_entry(nh, &rt6_nh_list, next) {
4293 rt_last = nh->fib6_info;
4294 err = __ip6_ins_rt(nh->fib6_info, info, extack);
4295 fib6_info_release(nh->fib6_info);
4297 /* save reference to first route for notification */
4298 if (!rt_notif && !err)
4299 rt_notif = nh->fib6_info;
4301 /* nh->fib6_info is used or freed at this point, reset to NULL */
4302 nh->fib6_info = NULL;
4305 ip6_print_replace_route_err(&rt6_nh_list);
4310 /* Because each route is added like a single route we remove
4311 * these flags after the first nexthop: if there is a collision,
4312 * we have already failed to add the first nexthop:
4313 * fib6_add_rt2node() has rejected it; when replacing, old
4314 * nexthops have been replaced by the first new one; the rest should be added to it.
4317 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4322 /* success ... tell user about new route */
4323 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4327 /* send notification for routes that were added so that
4328 * the delete notifications sent by ip6_route_del are coherent.
4332 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4334 /* Delete routes that were already added */
4335 list_for_each_entry(nh, &rt6_nh_list, next) {
4338 ip6_route_del(&nh->r_cfg, extack);
4342 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4344 fib6_info_release(nh->fib6_info);
4345 list_del(&nh->next);
4352 static int ip6_route_multipath_del(struct fib6_config *cfg,
4353 struct netlink_ext_ack *extack)
4355 struct fib6_config r_cfg;
4356 struct rtnexthop *rtnh;
4359 int err = 1, last_err = 0;
4361 remaining = cfg->fc_mp_len;
4362 rtnh = (struct rtnexthop *)cfg->fc_mp;
4364 /* Parse a Multipath Entry */
4365 while (rtnh_ok(rtnh, remaining)) {
4366 memcpy(&r_cfg, cfg, sizeof(*cfg));
4367 if (rtnh->rtnh_ifindex)
4368 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4370 attrlen = rtnh_attrlen(rtnh);
4372 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4374 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4376 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4377 r_cfg.fc_flags |= RTF_GATEWAY;
4380 err = ip6_route_del(&r_cfg, extack);
4384 rtnh = rtnh_next(rtnh, &remaining);
4390 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4391 struct netlink_ext_ack *extack)
4393 struct fib6_config cfg;
4396 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4401 return ip6_route_multipath_del(&cfg, extack);
4403 cfg.fc_delete_all_nh = 1;
4404 return ip6_route_del(&cfg, extack);
4408 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4409 struct netlink_ext_ack *extack)
4411 struct fib6_config cfg;
4414 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4419 return ip6_route_multipath_add(&cfg, extack);
4421 return ip6_route_add(&cfg, GFP_KERNEL, extack);
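/* Worst-case netlink message size for this route, used to size the
 * notification skb; sibling nexthops are accounted for with one
 * RTA_MULTIPATH-style estimate per sibling.
 */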
4424 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4426 int nexthop_len = 0;
4428 if (rt->fib6_nsiblings) {
4429 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
4430 + NLA_ALIGN(sizeof(struct rtnexthop))
4431 + nla_total_size(16) /* RTA_GATEWAY */
4432 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
4434 nexthop_len *= rt->fib6_nsiblings;
4437 return NLMSG_ALIGN(sizeof(struct rtmsg))
4438 + nla_total_size(16) /* RTA_SRC */
4439 + nla_total_size(16) /* RTA_DST */
4440 + nla_total_size(16) /* RTA_GATEWAY */
4441 + nla_total_size(16) /* RTA_PREFSRC */
4442 + nla_total_size(4) /* RTA_TABLE */
4443 + nla_total_size(4) /* RTA_IIF */
4444 + nla_total_size(4) /* RTA_OIF */
4445 + nla_total_size(4) /* RTA_PRIORITY */
4446 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4447 + nla_total_size(sizeof(struct rta_cacheinfo))
4448 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4449 + nla_total_size(1) /* RTA_PREF */
4450 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
4454 static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
4455 unsigned int *flags, bool skip_oif)
4457 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4458 *flags |= RTNH_F_DEAD;
4460 if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
4461 *flags |= RTNH_F_LINKDOWN;
4464 if (fib6_ignore_linkdown(rt))
4465 *flags |= RTNH_F_DEAD;
4469 if (rt->fib6_flags & RTF_GATEWAY) {
4470 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
4471 goto nla_put_failure;
4474 *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
4475 if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
4476 *flags |= RTNH_F_OFFLOAD;
4478 /* not needed for multipath encoding b/c it has a rtnexthop struct */
4479 if (!skip_oif && rt->fib6_nh.nh_dev &&
4480 nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
4481 goto nla_put_failure;
4483 if (rt->fib6_nh.nh_lwtstate &&
4484 lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
4485 goto nla_put_failure;
4493 /* add multipath next hop */
4494 static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
4496 const struct net_device *dev = rt->fib6_nh.nh_dev;
4497 struct rtnexthop *rtnh;
4498 unsigned int flags = 0;
4500 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4502 goto nla_put_failure;
4504 rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
4505 rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
4507 if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4508 goto nla_put_failure;
4510 rtnh->rtnh_flags = flags;
4512 /* length of rtnetlink header + attributes */
4513 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4521 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4522 struct fib6_info *rt, struct dst_entry *dst,
4523 struct in6_addr *dest, struct in6_addr *src,
4524 int iif, int type, u32 portid, u32 seq,
4528 struct nlmsghdr *nlh;
4533 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4537 rtm = nlmsg_data(nlh);
4538 rtm->rtm_family = AF_INET6;
4539 rtm->rtm_dst_len = rt->fib6_dst.plen;
4540 rtm->rtm_src_len = rt->fib6_src.plen;
4543 table = rt->fib6_table->tb6_id;
4545 table = RT6_TABLE_UNSPEC;
4546 rtm->rtm_table = table;
4547 if (nla_put_u32(skb, RTA_TABLE, table))
4548 goto nla_put_failure;
4550 rtm->rtm_type = rt->fib6_type;
4552 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4553 rtm->rtm_protocol = rt->fib6_protocol;
4555 if (rt->fib6_flags & RTF_CACHE)
4556 rtm->rtm_flags |= RTM_F_CLONED;
4559 if (nla_put_in6_addr(skb, RTA_DST, dest))
4560 goto nla_put_failure;
4561 rtm->rtm_dst_len = 128;
4562 } else if (rtm->rtm_dst_len)
4563 if (nla_put_in6_addr(skb, RTA_DST, &rt->fib6_dst.addr))
4564 goto nla_put_failure;
4565 #ifdef CONFIG_IPV6_SUBTREES
4567 if (nla_put_in6_addr(skb, RTA_SRC, src))
4568 goto nla_put_failure;
4569 rtm->rtm_src_len = 128;
4570 } else if (rtm->rtm_src_len &&
4571 nla_put_in6_addr(skb, RTA_SRC, &rt->fib6_src.addr))
4572 goto nla_put_failure;
4575 #ifdef CONFIG_IPV6_MROUTE
4576 if (ipv6_addr_is_multicast(&rt->fib6_dst.addr)) {
4577 int err = ip6mr_get_route(net, skb, rtm, portid);
4582 goto nla_put_failure;
4585 if (nla_put_u32(skb, RTA_IIF, iif))
4586 goto nla_put_failure;
4588 struct in6_addr saddr_buf;
4589 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4590 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4591 goto nla_put_failure;
4594 if (rt->fib6_prefsrc.plen) {
4595 struct in6_addr saddr_buf;
4596 saddr_buf = rt->fib6_prefsrc.addr;
4597 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4598 goto nla_put_failure;
4601 pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
4602 if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4603 goto nla_put_failure;
4605 if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
4606 goto nla_put_failure;
4608 /* For multipath routes, walk the siblings list and add
4609 * each as a nexthop within RTA_MULTIPATH.
4611 if (rt->fib6_nsiblings) {
4612 struct fib6_info *sibling, *next_sibling;
4615 mp = nla_nest_start(skb, RTA_MULTIPATH);
4617 goto nla_put_failure;
4619 if (rt6_add_nexthop(skb, rt) < 0)
4620 goto nla_put_failure;
4622 list_for_each_entry_safe(sibling, next_sibling,
4623 &rt->fib6_siblings, fib6_siblings) {
4624 if (rt6_add_nexthop(skb, sibling) < 0)
4625 goto nla_put_failure;
4628 nla_nest_end(skb, mp);
4630 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4631 goto nla_put_failure;
4634 if (rt->fib6_flags & RTF_EXPIRES) {
4635 expires = dst ? dst->expires : rt->expires;
4639 if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4640 goto nla_put_failure;
4642 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->fib6_flags)))
4643 goto nla_put_failure;
4646 nlmsg_end(skb, nlh);
4650 nlmsg_cancel(skb, nlh);
4654 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4656 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4657 struct net *net = arg->net;
4659 if (rt == net->ipv6.fib6_null_entry)
4662 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4663 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4665 /* user wants prefix routes only */
4666 if (rtm->rtm_flags & RTM_F_PREFIX &&
4667 !(rt->fib6_flags & RTF_PREFIX_RT)) {
4668 /* success since this is not a prefix route */
4673 return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4674 RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4675 arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
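/* RTM_GETROUTE: build a flow from the request attributes, resolve it
 * through the input or output path depending on RTA_IIF/RTA_OIF, and
 * return the result; with RTM_F_FIB_MATCH, report the FIB entry that
 * matched instead of the resolved dst.
 */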
4678 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4679 struct netlink_ext_ack *extack)
4681 struct net *net = sock_net(in_skb->sk);
4682 struct nlattr *tb[RTA_MAX+1];
4683 int err, iif = 0, oif = 0;
4684 struct dst_entry *dst;
4685 struct rt6_info *rt;
4686 struct sk_buff *skb;
4691 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4697 memset(&fl6, 0, sizeof(fl6));
4698 rtm = nlmsg_data(nlh);
4699 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4700 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4703 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4706 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4710 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4713 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4717 iif = nla_get_u32(tb[RTA_IIF]);
4720 oif = nla_get_u32(tb[RTA_OIF]);
4723 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4726 fl6.flowi6_uid = make_kuid(current_user_ns(),
4727 nla_get_u32(tb[RTA_UID]));
4729 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4732 struct net_device *dev;
4737 dev = dev_get_by_index_rcu(net, iif);
4744 fl6.flowi6_iif = iif;
4746 if (!ipv6_addr_any(&fl6.saddr))
4747 flags |= RT6_LOOKUP_F_HAS_SADDR;
4749 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
4753 fl6.flowi6_oif = oif;
4755 dst = ip6_route_output(net, NULL, &fl6);
4759 rt = container_of(dst, struct rt6_info, dst);
4760 if (rt->dst.error) {
4761 err = rt->dst.error;
4766 if (rt == net->ipv6.ip6_null_entry) {
4767 err = rt->dst.error;
4772 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4779 skb_dst_set(skb, &rt->dst);
4781 err = rt6_fill_node(net, skb, rt->from, NULL, NULL, NULL, iif,
4782 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4785 err = rt6_fill_node(net, skb, rt->from, dst,
4786 &fl6.daddr, &fl6.saddr, iif, RTM_NEWROUTE,
4787 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
4794 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4799 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
4800 unsigned int nlm_flags)
4802 struct sk_buff *skb;
4803 struct net *net = info->nl_net;
4808 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4810 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4814 err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
4815 event, info->portid, seq, nlm_flags);
4817 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4818 WARN_ON(err == -EMSGSIZE);
4822 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4823 info->nlh, gfp_any());
4827 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4830 static int ip6_route_dev_notify(struct notifier_block *this,
4831 unsigned long event, void *ptr)
4833 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4834 struct net *net = dev_net(dev);
4836 if (!(dev->flags & IFF_LOOPBACK))
4839 if (event == NETDEV_REGISTER) {
4840 net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
4841 net->ipv6.ip6_null_entry->dst.dev = dev;
4842 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
4843 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4844 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
4845 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
4846 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
4847 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
4849 } else if (event == NETDEV_UNREGISTER &&
4850 dev->reg_state != NETREG_UNREGISTERED) {
4851 /* NETDEV_UNREGISTER could be fired multiple times by
4852 * netdev_wait_allrefs(). Make sure we only call this once.
4854 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
4855 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4856 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
4857 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
4868 #ifdef CONFIG_PROC_FS
4870 static const struct file_operations ipv6_route_proc_fops = {
4871 .open = ipv6_route_open,
4873 .llseek = seq_lseek,
4874 .release = seq_release_net,
4877 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
4879 struct net *net = (struct net *)seq->private;
4880 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
4881 net->ipv6.rt6_stats->fib_nodes,
4882 net->ipv6.rt6_stats->fib_route_nodes,
4883 atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
4884 net->ipv6.rt6_stats->fib_rt_entries,
4885 net->ipv6.rt6_stats->fib_rt_cache,
4886 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
4887 net->ipv6.rt6_stats->fib_discarded_routes);
4892 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
4894 return single_open_net(inode, file, rt6_stats_seq_show);
4897 static const struct file_operations rt6_stats_seq_fops = {
4898 .open = rt6_stats_seq_open,
4900 .llseek = seq_lseek,
4901 .release = single_release_net,
4903 #endif /* CONFIG_PROC_FS */
4905 #ifdef CONFIG_SYSCTL
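/* net.ipv6.route.flush: writing triggers an immediate fib6_run_gc()
 * pass. Note that flush_delay is sampled before proc_dointvec()
 * stores the new value, so the pass uses the previously configured
 * delay.
 */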
4908 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4909 void __user *buffer, size_t *lenp, loff_t *ppos)
4916 net = (struct net *)ctl->extra1;
4917 delay = net->ipv6.sysctl.flush_delay;
4918 proc_dointvec(ctl, write, buffer, lenp, ppos);
4919 fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
4923 struct ctl_table ipv6_route_table_template[] = {
4925 .procname = "flush",
4926 .data = &init_net.ipv6.sysctl.flush_delay,
4927 .maxlen = sizeof(int),
4929 .proc_handler = ipv6_sysctl_rtcache_flush
4932 .procname = "gc_thresh",
4933 .data = &ip6_dst_ops_template.gc_thresh,
4934 .maxlen = sizeof(int),
4936 .proc_handler = proc_dointvec,
4939 .procname = "max_size",
4940 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
4941 .maxlen = sizeof(int),
4943 .proc_handler = proc_dointvec,
4946 .procname = "gc_min_interval",
4947 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4948 .maxlen = sizeof(int),
4950 .proc_handler = proc_dointvec_jiffies,
4953 .procname = "gc_timeout",
4954 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
4955 .maxlen = sizeof(int),
4957 .proc_handler = proc_dointvec_jiffies,
4960 .procname = "gc_interval",
4961 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
4962 .maxlen = sizeof(int),
4964 .proc_handler = proc_dointvec_jiffies,
4967 .procname = "gc_elasticity",
4968 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
4969 .maxlen = sizeof(int),
4971 .proc_handler = proc_dointvec,
4974 .procname = "mtu_expires",
4975 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
4976 .maxlen = sizeof(int),
4978 .proc_handler = proc_dointvec_jiffies,
4981 .procname = "min_adv_mss",
4982 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
4983 .maxlen = sizeof(int),
4985 .proc_handler = proc_dointvec,
4988 .procname = "gc_min_interval_ms",
4989 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4990 .maxlen = sizeof(int),
4992 .proc_handler = proc_dointvec_ms_jiffies,
4997 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
4999 struct ctl_table *table;
5001 table = kmemdup(ipv6_route_table_template,
5002 sizeof(ipv6_route_table_template),
5006 table[0].data = &net->ipv6.sysctl.flush_delay;
5007 table[0].extra1 = net;
5008 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
5009 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
5010 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5011 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
5012 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
5013 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
5014 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5015 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5016 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5018 /* Don't export sysctls to unprivileged users */
5019 if (net->user_ns != &init_user_ns)
5020 table[0].procname = NULL;
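/* Per-netns init: clone the dst_ops template, allocate the null (and,
 * with CONFIG_IPV6_MULTIPLE_TABLES, prohibit/blackhole) template
 * entries, and seed the routing sysctl defaults.
 */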
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	/* minimum advertised MSS: IPV6_MIN_MTU less TCP (20) and
	 * IPv6 (40) header bytes
	 */
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
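/* The template entries duplicated above back the special route types:
 * lookups that miss resolve to ip6_null_entry (-ENETUNREACH), "prohibit"
 * routes to ip6_prohibit_entry (-EACCES), and "blackhole" routes to
 * ip6_blk_hole_entry (silent discard). Illustrative shell usage, with
 * example documentation prefixes:
 *
 *	ip -6 route add prohibit 2001:db8:bad::/48
 *	ip -6 route add blackhole 2001:db8:dead::/48
 */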
static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
	proc_create("rt6_stats", 0444, net->proc_net, &rt6_stats_seq_fops);
#endif
	return 0;
}

static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};

static int __net_init ipv6_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv6.peers = bp;
	return 0;
}

static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	net->ipv6.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static struct pernet_operations ipv6_inetpeer_ops = {
	.init = ipv6_inetpeer_init,
	.exit = ipv6_inetpeer_exit,
};

static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};
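/* Pernet pattern, for orientation: register_pernet_subsys() runs ->init
 * for init_net and for every namespace created afterwards, and ->exit on
 * namespace teardown (and for all namespaces at unregister). The actual
 * registrations happen in ip6_route_init() below.
 */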
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	/* lower priority than addrconf's notifier, so this runs after it */
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
void __init ip6_route_init_special_entries(void)
{
	/* The loopback device is registered before this code runs, so the
	 * loopback reference in rt6_info was never taken; take it manually
	 * for init_net.
	 */
	init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#endif
}
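/* For orientation: this helper is expected to be invoked from
 * addrconf_init() (net/ipv6/addrconf.c), once init_net's loopback device
 * has been registered and has its inet6_dev set up.
 */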
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;
	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;
	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	/* rtnetlink handlers for route add/del/get requests */
	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;
	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;
	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}
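/* For orientation: ip6_route_init() runs from inet6_init()
 * (net/ipv6/af_inet6.c); ip6_route_cleanup() mirrors it in reverse and is
 * reached from inet6_init()'s unwind path if a later setup step fails.
 */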