]> asedeno.scripts.mit.edu Git - linux.git/blob - net/ipv6/route.c
ipv6: Refactor __ip6_route_redirect
[linux.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <net/ip.h>
67 #include <linux/uaccess.h>
68
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72
73 static int ip6_rt_type_to_error(u8 fib6_type);
74
75 #define CREATE_TRACE_POINTS
76 #include <trace/events/fib6.h>
77 EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
78 #undef CREATE_TRACE_POINTS
79
/*
 * Outcome of scoring a nexthop (see rt6_check_neigh(), rt6_score_route()
 * and find_match() in this file).  Negative values are failures; they
 * share the return channel with the non-negative route scores.
 */
enum rt6_nud_state {
	/* nexthop must not be used; find_match() rejects it outright */
	RT6_NUD_FAIL_HARD	= -3,
	/* neighbour entry is in NUD_FAILED state (CONFIG_IPV6_ROUTER_PREF) */
	RT6_NUD_FAIL_PROBE	= -2,
	/* no neighbour entry; find_match() scores it 0 and requests round-robin */
	RT6_NUD_FAIL_DO_RR	= -1,
	/* neighbour is considered usable */
	RT6_NUD_SUCCEED		= 1
};
86
87 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
88 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
89 static unsigned int      ip6_mtu(const struct dst_entry *dst);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void             ip6_dst_destroy(struct dst_entry *);
92 static void             ip6_dst_ifdown(struct dst_entry *,
93                                        struct net_device *dev, int how);
94 static int               ip6_dst_gc(struct dst_ops *ops);
95
96 static int              ip6_pkt_discard(struct sk_buff *skb);
97 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
98 static int              ip6_pkt_prohibit(struct sk_buff *skb);
99 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
100 static void             ip6_link_failure(struct sk_buff *skb);
101 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
102                                            struct sk_buff *skb, u32 mtu);
103 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
104                                         struct sk_buff *skb);
105 static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
106                            int strict);
107 static size_t rt6_nlmsg_size(struct fib6_info *rt);
108 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
109                          struct fib6_info *rt, struct dst_entry *dst,
110                          struct in6_addr *dest, struct in6_addr *src,
111                          int iif, int type, u32 portid, u32 seq,
112                          unsigned int flags);
113 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
114                                            struct in6_addr *daddr,
115                                            struct in6_addr *saddr);
116
117 #ifdef CONFIG_IPV6_ROUTE_INFO
118 static struct fib6_info *rt6_add_route_info(struct net *net,
119                                            const struct in6_addr *prefix, int prefixlen,
120                                            const struct in6_addr *gwaddr,
121                                            struct net_device *dev,
122                                            unsigned int pref);
123 static struct fib6_info *rt6_get_route_info(struct net *net,
124                                            const struct in6_addr *prefix, int prefixlen,
125                                            const struct in6_addr *gwaddr,
126                                            struct net_device *dev);
127 #endif
128
129 struct uncached_list {
130         spinlock_t              lock;
131         struct list_head        head;
132 };
133
134 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
135
136 void rt6_uncached_list_add(struct rt6_info *rt)
137 {
138         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
139
140         rt->rt6i_uncached_list = ul;
141
142         spin_lock_bh(&ul->lock);
143         list_add_tail(&rt->rt6i_uncached, &ul->head);
144         spin_unlock_bh(&ul->lock);
145 }
146
147 void rt6_uncached_list_del(struct rt6_info *rt)
148 {
149         if (!list_empty(&rt->rt6i_uncached)) {
150                 struct uncached_list *ul = rt->rt6i_uncached_list;
151                 struct net *net = dev_net(rt->dst.dev);
152
153                 spin_lock_bh(&ul->lock);
154                 list_del(&rt->rt6i_uncached);
155                 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
156                 spin_unlock_bh(&ul->lock);
157         }
158 }
159
160 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
161 {
162         struct net_device *loopback_dev = net->loopback_dev;
163         int cpu;
164
165         if (dev == loopback_dev)
166                 return;
167
168         for_each_possible_cpu(cpu) {
169                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
170                 struct rt6_info *rt;
171
172                 spin_lock_bh(&ul->lock);
173                 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
174                         struct inet6_dev *rt_idev = rt->rt6i_idev;
175                         struct net_device *rt_dev = rt->dst.dev;
176
177                         if (rt_idev->dev == dev) {
178                                 rt->rt6i_idev = in6_dev_get(loopback_dev);
179                                 in6_dev_put(rt_idev);
180                         }
181
182                         if (rt_dev == dev) {
183                                 rt->dst.dev = loopback_dev;
184                                 dev_hold(rt->dst.dev);
185                                 dev_put(rt_dev);
186                         }
187                 }
188                 spin_unlock_bh(&ul->lock);
189         }
190 }
191
192 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
193                                              struct sk_buff *skb,
194                                              const void *daddr)
195 {
196         if (!ipv6_addr_any(p))
197                 return (const void *) p;
198         else if (skb)
199                 return &ipv6_hdr(skb)->daddr;
200         return daddr;
201 }
202
203 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
204                                    struct net_device *dev,
205                                    struct sk_buff *skb,
206                                    const void *daddr)
207 {
208         struct neighbour *n;
209
210         daddr = choose_neigh_daddr(gw, skb, daddr);
211         n = __ipv6_neigh_lookup(dev, daddr);
212         if (n)
213                 return n;
214
215         n = neigh_create(&nd_tbl, daddr, dev);
216         return IS_ERR(n) ? NULL : n;
217 }
218
219 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
220                                               struct sk_buff *skb,
221                                               const void *daddr)
222 {
223         const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
224
225         return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
226 }
227
228 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
229 {
230         struct net_device *dev = dst->dev;
231         struct rt6_info *rt = (struct rt6_info *)dst;
232
233         daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
234         if (!daddr)
235                 return;
236         if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
237                 return;
238         if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
239                 return;
240         __ipv6_confirm_neigh(dev, daddr);
241 }
242
243 static struct dst_ops ip6_dst_ops_template = {
244         .family                 =       AF_INET6,
245         .gc                     =       ip6_dst_gc,
246         .gc_thresh              =       1024,
247         .check                  =       ip6_dst_check,
248         .default_advmss         =       ip6_default_advmss,
249         .mtu                    =       ip6_mtu,
250         .cow_metrics            =       dst_cow_metrics_generic,
251         .destroy                =       ip6_dst_destroy,
252         .ifdown                 =       ip6_dst_ifdown,
253         .negative_advice        =       ip6_negative_advice,
254         .link_failure           =       ip6_link_failure,
255         .update_pmtu            =       ip6_rt_update_pmtu,
256         .redirect               =       rt6_do_redirect,
257         .local_out              =       __ip6_local_out,
258         .neigh_lookup           =       ip6_dst_neigh_lookup,
259         .confirm_neigh          =       ip6_confirm_neigh,
260 };
261
262 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
263 {
264         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
265
266         return mtu ? : dst->dev->mtu;
267 }
268
269 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
270                                          struct sk_buff *skb, u32 mtu)
271 {
272 }
273
/* dst_ops->redirect for blackhole dsts: intentionally does nothing. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
	/* no-op */
}
278
279 static struct dst_ops ip6_dst_blackhole_ops = {
280         .family                 =       AF_INET6,
281         .destroy                =       ip6_dst_destroy,
282         .check                  =       ip6_dst_check,
283         .mtu                    =       ip6_blackhole_mtu,
284         .default_advmss         =       ip6_default_advmss,
285         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
286         .redirect               =       ip6_rt_blackhole_redirect,
287         .cow_metrics            =       dst_cow_metrics_generic,
288         .neigh_lookup           =       ip6_dst_neigh_lookup,
289 };
290
291 static const u32 ip6_template_metrics[RTAX_MAX] = {
292         [RTAX_HOPLIMIT - 1] = 0,
293 };
294
295 static const struct fib6_info fib6_null_entry_template = {
296         .fib6_flags     = (RTF_REJECT | RTF_NONEXTHOP),
297         .fib6_protocol  = RTPROT_KERNEL,
298         .fib6_metric    = ~(u32)0,
299         .fib6_ref       = ATOMIC_INIT(1),
300         .fib6_type      = RTN_UNREACHABLE,
301         .fib6_metrics   = (struct dst_metrics *)&dst_default_metrics,
302 };
303
304 static const struct rt6_info ip6_null_entry_template = {
305         .dst = {
306                 .__refcnt       = ATOMIC_INIT(1),
307                 .__use          = 1,
308                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
309                 .error          = -ENETUNREACH,
310                 .input          = ip6_pkt_discard,
311                 .output         = ip6_pkt_discard_out,
312         },
313         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
314 };
315
316 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
317
318 static const struct rt6_info ip6_prohibit_entry_template = {
319         .dst = {
320                 .__refcnt       = ATOMIC_INIT(1),
321                 .__use          = 1,
322                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
323                 .error          = -EACCES,
324                 .input          = ip6_pkt_prohibit,
325                 .output         = ip6_pkt_prohibit_out,
326         },
327         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
328 };
329
330 static const struct rt6_info ip6_blk_hole_entry_template = {
331         .dst = {
332                 .__refcnt       = ATOMIC_INIT(1),
333                 .__use          = 1,
334                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
335                 .error          = -EINVAL,
336                 .input          = dst_discard,
337                 .output         = dst_discard_out,
338         },
339         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
340 };
341
342 #endif
343
344 static void rt6_info_init(struct rt6_info *rt)
345 {
346         struct dst_entry *dst = &rt->dst;
347
348         memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
349         INIT_LIST_HEAD(&rt->rt6i_uncached);
350 }
351
352 /* allocate dst with ip6_dst_ops */
353 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
354                                int flags)
355 {
356         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
357                                         1, DST_OBSOLETE_FORCE_CHK, flags);
358
359         if (rt) {
360                 rt6_info_init(rt);
361                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
362         }
363
364         return rt;
365 }
366 EXPORT_SYMBOL(ip6_dst_alloc);
367
368 static void ip6_dst_destroy(struct dst_entry *dst)
369 {
370         struct rt6_info *rt = (struct rt6_info *)dst;
371         struct fib6_info *from;
372         struct inet6_dev *idev;
373
374         ip_dst_metrics_put(dst);
375         rt6_uncached_list_del(rt);
376
377         idev = rt->rt6i_idev;
378         if (idev) {
379                 rt->rt6i_idev = NULL;
380                 in6_dev_put(idev);
381         }
382
383         rcu_read_lock();
384         from = rcu_dereference(rt->from);
385         rcu_assign_pointer(rt->from, NULL);
386         fib6_info_release(from);
387         rcu_read_unlock();
388 }
389
390 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
391                            int how)
392 {
393         struct rt6_info *rt = (struct rt6_info *)dst;
394         struct inet6_dev *idev = rt->rt6i_idev;
395         struct net_device *loopback_dev =
396                 dev_net(dev)->loopback_dev;
397
398         if (idev && idev->dev != loopback_dev) {
399                 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
400                 if (loopback_idev) {
401                         rt->rt6i_idev = loopback_idev;
402                         in6_dev_put(idev);
403                 }
404         }
405 }
406
407 static bool __rt6_check_expired(const struct rt6_info *rt)
408 {
409         if (rt->rt6i_flags & RTF_EXPIRES)
410                 return time_after(jiffies, rt->dst.expires);
411         else
412                 return false;
413 }
414
415 static bool rt6_check_expired(const struct rt6_info *rt)
416 {
417         struct fib6_info *from;
418
419         from = rcu_dereference(rt->from);
420
421         if (rt->rt6i_flags & RTF_EXPIRES) {
422                 if (time_after(jiffies, rt->dst.expires))
423                         return true;
424         } else if (from) {
425                 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
426                         fib6_check_expired(from);
427         }
428         return false;
429 }
430
431 struct fib6_info *fib6_multipath_select(const struct net *net,
432                                         struct fib6_info *match,
433                                         struct flowi6 *fl6, int oif,
434                                         const struct sk_buff *skb,
435                                         int strict)
436 {
437         struct fib6_info *sibling, *next_sibling;
438
439         /* We might have already computed the hash for ICMPv6 errors. In such
440          * case it will always be non-zero. Otherwise now is the time to do it.
441          */
442         if (!fl6->mp_hash)
443                 fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
444
445         if (fl6->mp_hash <= atomic_read(&match->fib6_nh.fib_nh_upper_bound))
446                 return match;
447
448         list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
449                                  fib6_siblings) {
450                 const struct fib6_nh *nh = &sibling->fib6_nh;
451                 int nh_upper_bound;
452
453                 nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
454                 if (fl6->mp_hash > nh_upper_bound)
455                         continue;
456                 if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0)
457                         break;
458                 match = sibling;
459                 break;
460         }
461
462         return match;
463 }
464
465 /*
466  *      Route lookup. rcu_read_lock() should be held.
467  */
468
469 static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh,
470                                const struct in6_addr *saddr, int oif, int flags)
471 {
472         const struct net_device *dev;
473
474         if (nh->fib_nh_flags & RTNH_F_DEAD)
475                 return false;
476
477         dev = nh->fib_nh_dev;
478         if (oif) {
479                 if (dev->ifindex == oif)
480                         return true;
481         } else {
482                 if (ipv6_chk_addr(net, saddr, dev,
483                                   flags & RT6_LOOKUP_F_IFACE))
484                         return true;
485         }
486
487         return false;
488 }
489
490 static inline struct fib6_info *rt6_device_match(struct net *net,
491                                                  struct fib6_info *rt,
492                                                     const struct in6_addr *saddr,
493                                                     int oif,
494                                                     int flags)
495 {
496         const struct fib6_nh *nh;
497         struct fib6_info *sprt;
498
499         if (!oif && ipv6_addr_any(saddr) &&
500             !(rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD))
501                 return rt;
502
503         for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
504                 nh = &sprt->fib6_nh;
505                 if (__rt6_device_match(net, nh, saddr, oif, flags))
506                         return sprt;
507         }
508
509         if (oif && flags & RT6_LOOKUP_F_IFACE)
510                 return net->ipv6.fib6_null_entry;
511
512         return rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
513 }
514
515 #ifdef CONFIG_IPV6_ROUTER_PREF
516 struct __rt6_probe_work {
517         struct work_struct work;
518         struct in6_addr target;
519         struct net_device *dev;
520 };
521
522 static void rt6_probe_deferred(struct work_struct *w)
523 {
524         struct in6_addr mcaddr;
525         struct __rt6_probe_work *work =
526                 container_of(w, struct __rt6_probe_work, work);
527
528         addrconf_addr_solict_mult(&work->target, &mcaddr);
529         ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
530         dev_put(work->dev);
531         kfree(work);
532 }
533
534 static void rt6_probe(struct fib6_nh *fib6_nh)
535 {
536         struct __rt6_probe_work *work = NULL;
537         const struct in6_addr *nh_gw;
538         struct neighbour *neigh;
539         struct net_device *dev;
540         struct inet6_dev *idev;
541
542         /*
543          * Okay, this does not seem to be appropriate
544          * for now, however, we need to check if it
545          * is really so; aka Router Reachability Probing.
546          *
547          * Router Reachability Probe MUST be rate-limited
548          * to no more than one per minute.
549          */
550         if (fib6_nh->fib_nh_gw_family)
551                 return;
552
553         nh_gw = &fib6_nh->fib_nh_gw6;
554         dev = fib6_nh->fib_nh_dev;
555         rcu_read_lock_bh();
556         idev = __in6_dev_get(dev);
557         neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
558         if (neigh) {
559                 if (neigh->nud_state & NUD_VALID)
560                         goto out;
561
562                 write_lock(&neigh->lock);
563                 if (!(neigh->nud_state & NUD_VALID) &&
564                     time_after(jiffies,
565                                neigh->updated + idev->cnf.rtr_probe_interval)) {
566                         work = kmalloc(sizeof(*work), GFP_ATOMIC);
567                         if (work)
568                                 __neigh_set_probe_once(neigh);
569                 }
570                 write_unlock(&neigh->lock);
571         } else if (time_after(jiffies, fib6_nh->last_probe +
572                                        idev->cnf.rtr_probe_interval)) {
573                 work = kmalloc(sizeof(*work), GFP_ATOMIC);
574         }
575
576         if (work) {
577                 fib6_nh->last_probe = jiffies;
578                 INIT_WORK(&work->work, rt6_probe_deferred);
579                 work->target = *nh_gw;
580                 dev_hold(dev);
581                 work->dev = dev;
582                 schedule_work(&work->work);
583         }
584
585 out:
586         rcu_read_unlock_bh();
587 }
588 #else
/* Router probing is compiled out without CONFIG_IPV6_ROUTER_PREF. */
static inline void rt6_probe(struct fib6_nh *fib6_nh)
{
	/* nothing to do */
}
592 #endif
593
594 /*
595  * Default Router Selection (RFC 2461 6.3.6)
596  */
597 static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh)
598 {
599         enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
600         struct neighbour *neigh;
601
602         rcu_read_lock_bh();
603         neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev,
604                                           &fib6_nh->fib_nh_gw6);
605         if (neigh) {
606                 read_lock(&neigh->lock);
607                 if (neigh->nud_state & NUD_VALID)
608                         ret = RT6_NUD_SUCCEED;
609 #ifdef CONFIG_IPV6_ROUTER_PREF
610                 else if (!(neigh->nud_state & NUD_FAILED))
611                         ret = RT6_NUD_SUCCEED;
612                 else
613                         ret = RT6_NUD_FAIL_PROBE;
614 #endif
615                 read_unlock(&neigh->lock);
616         } else {
617                 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
618                       RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
619         }
620         rcu_read_unlock_bh();
621
622         return ret;
623 }
624
625 static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
626                            int strict)
627 {
628         int m = 0;
629
630         if (!oif || nh->fib_nh_dev->ifindex == oif)
631                 m = 2;
632
633         if (!m && (strict & RT6_LOOKUP_F_IFACE))
634                 return RT6_NUD_FAIL_HARD;
635 #ifdef CONFIG_IPV6_ROUTER_PREF
636         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2;
637 #endif
638         if ((strict & RT6_LOOKUP_F_REACHABLE) &&
639             !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) {
640                 int n = rt6_check_neigh(nh);
641                 if (n < 0)
642                         return n;
643         }
644         return m;
645 }
646
647 static bool find_match(struct fib6_nh *nh, u32 fib6_flags,
648                        int oif, int strict, int *mpri, bool *do_rr)
649 {
650         bool match_do_rr = false;
651         bool rc = false;
652         int m;
653
654         if (nh->fib_nh_flags & RTNH_F_DEAD)
655                 goto out;
656
657         if (ip6_ignore_linkdown(nh->fib_nh_dev) &&
658             nh->fib_nh_flags & RTNH_F_LINKDOWN &&
659             !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
660                 goto out;
661
662         m = rt6_score_route(nh, fib6_flags, oif, strict);
663         if (m == RT6_NUD_FAIL_DO_RR) {
664                 match_do_rr = true;
665                 m = 0; /* lowest valid score */
666         } else if (m == RT6_NUD_FAIL_HARD) {
667                 goto out;
668         }
669
670         if (strict & RT6_LOOKUP_F_REACHABLE)
671                 rt6_probe(nh);
672
673         /* note that m can be RT6_NUD_FAIL_PROBE at this point */
674         if (m > *mpri) {
675                 *do_rr = match_do_rr;
676                 *mpri = m;
677                 rc = true;
678         }
679 out:
680         return rc;
681 }
682
683 static void __find_rr_leaf(struct fib6_info *rt_start,
684                            struct fib6_info *nomatch, u32 metric,
685                            struct fib6_info **match, struct fib6_info **cont,
686                            int oif, int strict, bool *do_rr, int *mpri)
687 {
688         struct fib6_info *rt;
689
690         for (rt = rt_start;
691              rt && rt != nomatch;
692              rt = rcu_dereference(rt->fib6_next)) {
693                 struct fib6_nh *nh;
694
695                 if (cont && rt->fib6_metric != metric) {
696                         *cont = rt;
697                         return;
698                 }
699
700                 if (fib6_check_expired(rt))
701                         continue;
702
703                 nh = &rt->fib6_nh;
704                 if (find_match(nh, rt->fib6_flags, oif, strict, mpri, do_rr))
705                         *match = rt;
706         }
707 }
708
709 static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
710                                       struct fib6_info *leaf,
711                                       struct fib6_info *rr_head,
712                                       u32 metric, int oif, int strict,
713                                       bool *do_rr)
714 {
715         struct fib6_info *match = NULL, *cont = NULL;
716         int mpri = -1;
717
718         __find_rr_leaf(rr_head, NULL, metric, &match, &cont,
719                        oif, strict, do_rr, &mpri);
720
721         __find_rr_leaf(leaf, rr_head, metric, &match, &cont,
722                        oif, strict, do_rr, &mpri);
723
724         if (match || !cont)
725                 return match;
726
727         __find_rr_leaf(cont, NULL, metric, &match, NULL,
728                        oif, strict, do_rr, &mpri);
729
730         return match;
731 }
732
733 static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
734                                    int oif, int strict)
735 {
736         struct fib6_info *leaf = rcu_dereference(fn->leaf);
737         struct fib6_info *match, *rt0;
738         bool do_rr = false;
739         int key_plen;
740
741         if (!leaf || leaf == net->ipv6.fib6_null_entry)
742                 return net->ipv6.fib6_null_entry;
743
744         rt0 = rcu_dereference(fn->rr_ptr);
745         if (!rt0)
746                 rt0 = leaf;
747
748         /* Double check to make sure fn is not an intermediate node
749          * and fn->leaf does not points to its child's leaf
750          * (This might happen if all routes under fn are deleted from
751          * the tree and fib6_repair_tree() is called on the node.)
752          */
753         key_plen = rt0->fib6_dst.plen;
754 #ifdef CONFIG_IPV6_SUBTREES
755         if (rt0->fib6_src.plen)
756                 key_plen = rt0->fib6_src.plen;
757 #endif
758         if (fn->fn_bit != key_plen)
759                 return net->ipv6.fib6_null_entry;
760
761         match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
762                              &do_rr);
763
764         if (do_rr) {
765                 struct fib6_info *next = rcu_dereference(rt0->fib6_next);
766
767                 /* no entries matched; do round-robin */
768                 if (!next || next->fib6_metric != rt0->fib6_metric)
769                         next = leaf;
770
771                 if (next != rt0) {
772                         spin_lock_bh(&leaf->fib6_table->tb6_lock);
773                         /* make sure next is not being deleted from the tree */
774                         if (next->fib6_node)
775                                 rcu_assign_pointer(fn->rr_ptr, next);
776                         spin_unlock_bh(&leaf->fib6_table->tb6_lock);
777                 }
778         }
779
780         return match ? match : net->ipv6.fib6_null_entry;
781 }
782
783 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
784 {
785         return (rt->fib6_flags & RTF_NONEXTHOP) || rt->fib6_nh.fib_nh_gw_family;
786 }
787
788 #ifdef CONFIG_IPV6_ROUTE_INFO
789 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
790                   const struct in6_addr *gwaddr)
791 {
792         struct net *net = dev_net(dev);
793         struct route_info *rinfo = (struct route_info *) opt;
794         struct in6_addr prefix_buf, *prefix;
795         unsigned int pref;
796         unsigned long lifetime;
797         struct fib6_info *rt;
798
799         if (len < sizeof(struct route_info)) {
800                 return -EINVAL;
801         }
802
803         /* Sanity check for prefix_len and length */
804         if (rinfo->length > 3) {
805                 return -EINVAL;
806         } else if (rinfo->prefix_len > 128) {
807                 return -EINVAL;
808         } else if (rinfo->prefix_len > 64) {
809                 if (rinfo->length < 2) {
810                         return -EINVAL;
811                 }
812         } else if (rinfo->prefix_len > 0) {
813                 if (rinfo->length < 1) {
814                         return -EINVAL;
815                 }
816         }
817
818         pref = rinfo->route_pref;
819         if (pref == ICMPV6_ROUTER_PREF_INVALID)
820                 return -EINVAL;
821
822         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
823
824         if (rinfo->length == 3)
825                 prefix = (struct in6_addr *)rinfo->prefix;
826         else {
827                 /* this function is safe */
828                 ipv6_addr_prefix(&prefix_buf,
829                                  (struct in6_addr *)rinfo->prefix,
830                                  rinfo->prefix_len);
831                 prefix = &prefix_buf;
832         }
833
834         if (rinfo->prefix_len == 0)
835                 rt = rt6_get_dflt_router(net, gwaddr, dev);
836         else
837                 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
838                                         gwaddr, dev);
839
840         if (rt && !lifetime) {
841                 ip6_del_rt(net, rt);
842                 rt = NULL;
843         }
844
845         if (!rt && lifetime)
846                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
847                                         dev, pref);
848         else if (rt)
849                 rt->fib6_flags = RTF_ROUTEINFO |
850                                  (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
851
852         if (rt) {
853                 if (!addrconf_finite_timeout(lifetime))
854                         fib6_clean_expires(rt);
855                 else
856                         fib6_set_expires(rt, jiffies + HZ * lifetime);
857
858                 fib6_info_release(rt);
859         }
860         return 0;
861 }
862 #endif
863
864 /*
865  *      Misc support functions
866  */
867
868 /* called with rcu_lock held */
869 static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
870 {
871         struct net_device *dev = rt->fib6_nh.fib_nh_dev;
872
873         if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
874                 /* for copies of local routes, dst->dev needs to be the
875                  * device if it is a master device, the master device if
876                  * device is enslaved, and the loopback as the default
877                  */
878                 if (netif_is_l3_slave(dev) &&
879                     !rt6_need_strict(&rt->fib6_dst.addr))
880                         dev = l3mdev_master_dev_rcu(dev);
881                 else if (!netif_is_l3_master(dev))
882                         dev = dev_net(dev)->loopback_dev;
883                 /* last case is netif_is_l3_master(dev) is true in which
884                  * case we want dev returned to be dev
885                  */
886         }
887
888         return dev;
889 }
890
891 static const int fib6_prop[RTN_MAX + 1] = {
892         [RTN_UNSPEC]    = 0,
893         [RTN_UNICAST]   = 0,
894         [RTN_LOCAL]     = 0,
895         [RTN_BROADCAST] = 0,
896         [RTN_ANYCAST]   = 0,
897         [RTN_MULTICAST] = 0,
898         [RTN_BLACKHOLE] = -EINVAL,
899         [RTN_UNREACHABLE] = -EHOSTUNREACH,
900         [RTN_PROHIBIT]  = -EACCES,
901         [RTN_THROW]     = -EAGAIN,
902         [RTN_NAT]       = -EINVAL,
903         [RTN_XRESOLVE]  = -EINVAL,
904 };
905
906 static int ip6_rt_type_to_error(u8 fib6_type)
907 {
908         return fib6_prop[fib6_type];
909 }
910
911 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
912 {
913         unsigned short flags = 0;
914
915         if (rt->dst_nocount)
916                 flags |= DST_NOCOUNT;
917         if (rt->dst_nopolicy)
918                 flags |= DST_NOPOLICY;
919         if (rt->dst_host)
920                 flags |= DST_HOST;
921
922         return flags;
923 }
924
925 static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
926 {
927         rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);
928
929         switch (ort->fib6_type) {
930         case RTN_BLACKHOLE:
931                 rt->dst.output = dst_discard_out;
932                 rt->dst.input = dst_discard;
933                 break;
934         case RTN_PROHIBIT:
935                 rt->dst.output = ip6_pkt_prohibit_out;
936                 rt->dst.input = ip6_pkt_prohibit;
937                 break;
938         case RTN_THROW:
939         case RTN_UNREACHABLE:
940         default:
941                 rt->dst.output = ip6_pkt_discard_out;
942                 rt->dst.input = ip6_pkt_discard;
943                 break;
944         }
945 }
946
947 static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
948 {
949         if (ort->fib6_flags & RTF_REJECT) {
950                 ip6_rt_init_dst_reject(rt, ort);
951                 return;
952         }
953
954         rt->dst.error = 0;
955         rt->dst.output = ip6_output;
956
957         if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
958                 rt->dst.input = ip6_input;
959         } else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
960                 rt->dst.input = ip6_mc_input;
961         } else {
962                 rt->dst.input = ip6_forward;
963         }
964
965         if (ort->fib6_nh.fib_nh_lws) {
966                 rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.fib_nh_lws);
967                 lwtunnel_set_redirect(&rt->dst);
968         }
969
970         rt->dst.lastuse = jiffies;
971 }
972
973 /* Caller must already hold reference to @from */
974 static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
975 {
976         rt->rt6i_flags &= ~RTF_EXPIRES;
977         rcu_assign_pointer(rt->from, from);
978         ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
979 }
980
981 /* Caller must already hold reference to @ort */
982 static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
983 {
984         struct net_device *dev = fib6_info_nh_dev(ort);
985
986         ip6_rt_init_dst(rt, ort);
987
988         rt->rt6i_dst = ort->fib6_dst;
989         rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
990         rt->rt6i_flags = ort->fib6_flags;
991         if (ort->fib6_nh.fib_nh_gw_family) {
992                 rt->rt6i_gateway = ort->fib6_nh.fib_nh_gw6;
993                 rt->rt6i_flags |= RTF_GATEWAY;
994         }
995         rt6_set_from(rt, ort);
996 #ifdef CONFIG_IPV6_SUBTREES
997         rt->rt6i_src = ort->fib6_src;
998 #endif
999 }
1000
1001 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
1002                                         struct in6_addr *saddr)
1003 {
1004         struct fib6_node *pn, *sn;
1005         while (1) {
1006                 if (fn->fn_flags & RTN_TL_ROOT)
1007                         return NULL;
1008                 pn = rcu_dereference(fn->parent);
1009                 sn = FIB6_SUBTREE(pn);
1010                 if (sn && sn != fn)
1011                         fn = fib6_node_lookup(sn, NULL, saddr);
1012                 else
1013                         fn = pn;
1014                 if (fn->fn_flags & RTN_RTINFO)
1015                         return fn;
1016         }
1017 }
1018
1019 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
1020 {
1021         struct rt6_info *rt = *prt;
1022
1023         if (dst_hold_safe(&rt->dst))
1024                 return true;
1025         if (net) {
1026                 rt = net->ipv6.ip6_null_entry;
1027                 dst_hold(&rt->dst);
1028         } else {
1029                 rt = NULL;
1030         }
1031         *prt = rt;
1032         return false;
1033 }
1034
1035 /* called with rcu_lock held */
1036 static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
1037 {
1038         unsigned short flags = fib6_info_dst_flags(rt);
1039         struct net_device *dev = rt->fib6_nh.fib_nh_dev;
1040         struct rt6_info *nrt;
1041
1042         if (!fib6_info_hold_safe(rt))
1043                 goto fallback;
1044
1045         nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1046         if (!nrt) {
1047                 fib6_info_release(rt);
1048                 goto fallback;
1049         }
1050
1051         ip6_rt_copy_init(nrt, rt);
1052         return nrt;
1053
1054 fallback:
1055         nrt = dev_net(dev)->ipv6.ip6_null_entry;
1056         dst_hold(&nrt->dst);
1057         return nrt;
1058 }
1059
1060 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
1061                                              struct fib6_table *table,
1062                                              struct flowi6 *fl6,
1063                                              const struct sk_buff *skb,
1064                                              int flags)
1065 {
1066         struct fib6_info *f6i;
1067         struct fib6_node *fn;
1068         struct rt6_info *rt;
1069
1070         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1071                 flags &= ~RT6_LOOKUP_F_IFACE;
1072
1073         rcu_read_lock();
1074         fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1075 restart:
1076         f6i = rcu_dereference(fn->leaf);
1077         if (!f6i)
1078                 f6i = net->ipv6.fib6_null_entry;
1079         else
1080                 f6i = rt6_device_match(net, f6i, &fl6->saddr,
1081                                       fl6->flowi6_oif, flags);
1082
1083         if (f6i == net->ipv6.fib6_null_entry) {
1084                 fn = fib6_backtrack(fn, &fl6->saddr);
1085                 if (fn)
1086                         goto restart;
1087
1088                 rt = net->ipv6.ip6_null_entry;
1089                 dst_hold(&rt->dst);
1090                 goto out;
1091         }
1092
1093         if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
1094                 f6i = fib6_multipath_select(net, f6i, fl6, fl6->flowi6_oif, skb,
1095                                             flags);
1096         /* Search through exception table */
1097         rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1098         if (rt) {
1099                 if (ip6_hold_safe(net, &rt))
1100                         dst_use_noref(&rt->dst, jiffies);
1101         } else {
1102                 rt = ip6_create_rt_rcu(f6i);
1103         }
1104
1105 out:
1106         trace_fib6_table_lookup(net, f6i, table, fl6);
1107
1108         rcu_read_unlock();
1109
1110         return rt;
1111 }
1112
1113 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
1114                                    const struct sk_buff *skb, int flags)
1115 {
1116         return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
1117 }
1118 EXPORT_SYMBOL_GPL(ip6_route_lookup);
1119
1120 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1121                             const struct in6_addr *saddr, int oif,
1122                             const struct sk_buff *skb, int strict)
1123 {
1124         struct flowi6 fl6 = {
1125                 .flowi6_oif = oif,
1126                 .daddr = *daddr,
1127         };
1128         struct dst_entry *dst;
1129         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1130
1131         if (saddr) {
1132                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1133                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1134         }
1135
1136         dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1137         if (dst->error == 0)
1138                 return (struct rt6_info *) dst;
1139
1140         dst_release(dst);
1141
1142         return NULL;
1143 }
1144 EXPORT_SYMBOL(rt6_lookup);
1145
1146 /* ip6_ins_rt is called with FREE table->tb6_lock.
1147  * It takes new route entry, the addition fails by any reason the
1148  * route is released.
1149  * Caller must hold dst before calling it.
1150  */
1151
1152 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1153                         struct netlink_ext_ack *extack)
1154 {
1155         int err;
1156         struct fib6_table *table;
1157
1158         table = rt->fib6_table;
1159         spin_lock_bh(&table->tb6_lock);
1160         err = fib6_add(&table->tb6_root, rt, info, extack);
1161         spin_unlock_bh(&table->tb6_lock);
1162
1163         return err;
1164 }
1165
1166 int ip6_ins_rt(struct net *net, struct fib6_info *rt)
1167 {
1168         struct nl_info info = { .nl_net = net, };
1169
1170         return __ip6_ins_rt(rt, &info, NULL);
1171 }
1172
1173 static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
1174                                            const struct in6_addr *daddr,
1175                                            const struct in6_addr *saddr)
1176 {
1177         struct net_device *dev;
1178         struct rt6_info *rt;
1179
1180         /*
1181          *      Clone the route.
1182          */
1183
1184         if (!fib6_info_hold_safe(ort))
1185                 return NULL;
1186
1187         dev = ip6_rt_get_dev_rcu(ort);
1188         rt = ip6_dst_alloc(dev_net(dev), dev, 0);
1189         if (!rt) {
1190                 fib6_info_release(ort);
1191                 return NULL;
1192         }
1193
1194         ip6_rt_copy_init(rt, ort);
1195         rt->rt6i_flags |= RTF_CACHE;
1196         rt->dst.flags |= DST_HOST;
1197         rt->rt6i_dst.addr = *daddr;
1198         rt->rt6i_dst.plen = 128;
1199
1200         if (!rt6_is_gw_or_nonexthop(ort)) {
1201                 if (ort->fib6_dst.plen != 128 &&
1202                     ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
1203                         rt->rt6i_flags |= RTF_ANYCAST;
1204 #ifdef CONFIG_IPV6_SUBTREES
1205                 if (rt->rt6i_src.plen && saddr) {
1206                         rt->rt6i_src.addr = *saddr;
1207                         rt->rt6i_src.plen = 128;
1208                 }
1209 #endif
1210         }
1211
1212         return rt;
1213 }
1214
1215 static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
1216 {
1217         unsigned short flags = fib6_info_dst_flags(rt);
1218         struct net_device *dev;
1219         struct rt6_info *pcpu_rt;
1220
1221         if (!fib6_info_hold_safe(rt))
1222                 return NULL;
1223
1224         rcu_read_lock();
1225         dev = ip6_rt_get_dev_rcu(rt);
1226         pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
1227         rcu_read_unlock();
1228         if (!pcpu_rt) {
1229                 fib6_info_release(rt);
1230                 return NULL;
1231         }
1232         ip6_rt_copy_init(pcpu_rt, rt);
1233         pcpu_rt->rt6i_flags |= RTF_PCPU;
1234         return pcpu_rt;
1235 }
1236
1237 /* It should be called with rcu_read_lock() acquired */
1238 static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
1239 {
1240         struct rt6_info *pcpu_rt, **p;
1241
1242         p = this_cpu_ptr(rt->rt6i_pcpu);
1243         pcpu_rt = *p;
1244
1245         if (pcpu_rt)
1246                 ip6_hold_safe(NULL, &pcpu_rt);
1247
1248         return pcpu_rt;
1249 }
1250
1251 static struct rt6_info *rt6_make_pcpu_route(struct net *net,
1252                                             struct fib6_info *rt)
1253 {
1254         struct rt6_info *pcpu_rt, *prev, **p;
1255
1256         pcpu_rt = ip6_rt_pcpu_alloc(rt);
1257         if (!pcpu_rt) {
1258                 dst_hold(&net->ipv6.ip6_null_entry->dst);
1259                 return net->ipv6.ip6_null_entry;
1260         }
1261
1262         dst_hold(&pcpu_rt->dst);
1263         p = this_cpu_ptr(rt->rt6i_pcpu);
1264         prev = cmpxchg(p, NULL, pcpu_rt);
1265         BUG_ON(prev);
1266
1267         return pcpu_rt;
1268 }
1269
/* Exception hash table implementation.
 *
 * rt6_exception_lock is taken with spin_lock_bh() by the functions below
 * that insert into, remove from or flush the per-route exception buckets.
 */
static DEFINE_SPINLOCK(rt6_exception_lock);
1273
1274 /* Remove rt6_ex from hash table and free the memory
1275  * Caller must hold rt6_exception_lock
1276  */
1277 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1278                                  struct rt6_exception *rt6_ex)
1279 {
1280         struct fib6_info *from;
1281         struct net *net;
1282
1283         if (!bucket || !rt6_ex)
1284                 return;
1285
1286         net = dev_net(rt6_ex->rt6i->dst.dev);
1287         net->ipv6.rt6_stats->fib_rt_cache--;
1288
1289         /* purge completely the exception to allow releasing the held resources:
1290          * some [sk] cache may keep the dst around for unlimited time
1291          */
1292         from = rcu_dereference_protected(rt6_ex->rt6i->from,
1293                                          lockdep_is_held(&rt6_exception_lock));
1294         rcu_assign_pointer(rt6_ex->rt6i->from, NULL);
1295         fib6_info_release(from);
1296         dst_dev_put(&rt6_ex->rt6i->dst);
1297
1298         hlist_del_rcu(&rt6_ex->hlist);
1299         dst_release(&rt6_ex->rt6i->dst);
1300         kfree_rcu(rt6_ex, rcu);
1301         WARN_ON_ONCE(!bucket->depth);
1302         bucket->depth--;
1303 }
1304
1305 /* Remove oldest rt6_ex in bucket and free the memory
1306  * Caller must hold rt6_exception_lock
1307  */
1308 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1309 {
1310         struct rt6_exception *rt6_ex, *oldest = NULL;
1311
1312         if (!bucket)
1313                 return;
1314
1315         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1316                 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1317                         oldest = rt6_ex;
1318         }
1319         rt6_remove_exception(bucket, oldest);
1320 }
1321
1322 static u32 rt6_exception_hash(const struct in6_addr *dst,
1323                               const struct in6_addr *src)
1324 {
1325         static u32 seed __read_mostly;
1326         u32 val;
1327
1328         net_get_random_once(&seed, sizeof(seed));
1329         val = jhash(dst, sizeof(*dst), seed);
1330
1331 #ifdef CONFIG_IPV6_SUBTREES
1332         if (src)
1333                 val = jhash(src, sizeof(*src), val);
1334 #endif
1335         return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1336 }
1337
1338 /* Helper function to find the cached rt in the hash table
1339  * and update bucket pointer to point to the bucket for this
1340  * (daddr, saddr) pair
1341  * Caller must hold rt6_exception_lock
1342  */
1343 static struct rt6_exception *
1344 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1345                               const struct in6_addr *daddr,
1346                               const struct in6_addr *saddr)
1347 {
1348         struct rt6_exception *rt6_ex;
1349         u32 hval;
1350
1351         if (!(*bucket) || !daddr)
1352                 return NULL;
1353
1354         hval = rt6_exception_hash(daddr, saddr);
1355         *bucket += hval;
1356
1357         hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1358                 struct rt6_info *rt6 = rt6_ex->rt6i;
1359                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1360
1361 #ifdef CONFIG_IPV6_SUBTREES
1362                 if (matched && saddr)
1363                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1364 #endif
1365                 if (matched)
1366                         return rt6_ex;
1367         }
1368         return NULL;
1369 }
1370
1371 /* Helper function to find the cached rt in the hash table
1372  * and update bucket pointer to point to the bucket for this
1373  * (daddr, saddr) pair
1374  * Caller must hold rcu_read_lock()
1375  */
1376 static struct rt6_exception *
1377 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1378                          const struct in6_addr *daddr,
1379                          const struct in6_addr *saddr)
1380 {
1381         struct rt6_exception *rt6_ex;
1382         u32 hval;
1383
1384         WARN_ON_ONCE(!rcu_read_lock_held());
1385
1386         if (!(*bucket) || !daddr)
1387                 return NULL;
1388
1389         hval = rt6_exception_hash(daddr, saddr);
1390         *bucket += hval;
1391
1392         hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1393                 struct rt6_info *rt6 = rt6_ex->rt6i;
1394                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1395
1396 #ifdef CONFIG_IPV6_SUBTREES
1397                 if (matched && saddr)
1398                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1399 #endif
1400                 if (matched)
1401                         return rt6_ex;
1402         }
1403         return NULL;
1404 }
1405
1406 static unsigned int fib6_mtu(const struct fib6_info *rt)
1407 {
1408         unsigned int mtu;
1409
1410         if (rt->fib6_pmtu) {
1411                 mtu = rt->fib6_pmtu;
1412         } else {
1413                 struct net_device *dev = fib6_info_nh_dev(rt);
1414                 struct inet6_dev *idev;
1415
1416                 rcu_read_lock();
1417                 idev = __in6_dev_get(dev);
1418                 mtu = idev->cnf.mtu6;
1419                 rcu_read_unlock();
1420         }
1421
1422         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1423
1424         return mtu - lwtunnel_headroom(rt->fib6_nh.fib_nh_lws, mtu);
1425 }
1426
1427 static int rt6_insert_exception(struct rt6_info *nrt,
1428                                 struct fib6_info *ort)
1429 {
1430         struct net *net = dev_net(nrt->dst.dev);
1431         struct rt6_exception_bucket *bucket;
1432         struct in6_addr *src_key = NULL;
1433         struct rt6_exception *rt6_ex;
1434         int err = 0;
1435
1436         spin_lock_bh(&rt6_exception_lock);
1437
1438         if (ort->exception_bucket_flushed) {
1439                 err = -EINVAL;
1440                 goto out;
1441         }
1442
1443         bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1444                                         lockdep_is_held(&rt6_exception_lock));
1445         if (!bucket) {
1446                 bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1447                                  GFP_ATOMIC);
1448                 if (!bucket) {
1449                         err = -ENOMEM;
1450                         goto out;
1451                 }
1452                 rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1453         }
1454
1455 #ifdef CONFIG_IPV6_SUBTREES
1456         /* rt6i_src.plen != 0 indicates ort is in subtree
1457          * and exception table is indexed by a hash of
1458          * both rt6i_dst and rt6i_src.
1459          * Otherwise, the exception table is indexed by
1460          * a hash of only rt6i_dst.
1461          */
1462         if (ort->fib6_src.plen)
1463                 src_key = &nrt->rt6i_src.addr;
1464 #endif
1465         /* rt6_mtu_change() might lower mtu on ort.
1466          * Only insert this exception route if its mtu
1467          * is less than ort's mtu value.
1468          */
1469         if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
1470                 err = -EINVAL;
1471                 goto out;
1472         }
1473
1474         rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1475                                                src_key);
1476         if (rt6_ex)
1477                 rt6_remove_exception(bucket, rt6_ex);
1478
1479         rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1480         if (!rt6_ex) {
1481                 err = -ENOMEM;
1482                 goto out;
1483         }
1484         rt6_ex->rt6i = nrt;
1485         rt6_ex->stamp = jiffies;
1486         hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1487         bucket->depth++;
1488         net->ipv6.rt6_stats->fib_rt_cache++;
1489
1490         if (bucket->depth > FIB6_MAX_DEPTH)
1491                 rt6_exception_remove_oldest(bucket);
1492
1493 out:
1494         spin_unlock_bh(&rt6_exception_lock);
1495
1496         /* Update fn->fn_sernum to invalidate all cached dst */
1497         if (!err) {
1498                 spin_lock_bh(&ort->fib6_table->tb6_lock);
1499                 fib6_update_sernum(net, ort);
1500                 spin_unlock_bh(&ort->fib6_table->tb6_lock);
1501                 fib6_force_start_gc(net);
1502         }
1503
1504         return err;
1505 }
1506
1507 void rt6_flush_exceptions(struct fib6_info *rt)
1508 {
1509         struct rt6_exception_bucket *bucket;
1510         struct rt6_exception *rt6_ex;
1511         struct hlist_node *tmp;
1512         int i;
1513
1514         spin_lock_bh(&rt6_exception_lock);
1515         /* Prevent rt6_insert_exception() to recreate the bucket list */
1516         rt->exception_bucket_flushed = 1;
1517
1518         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1519                                     lockdep_is_held(&rt6_exception_lock));
1520         if (!bucket)
1521                 goto out;
1522
1523         for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1524                 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1525                         rt6_remove_exception(bucket, rt6_ex);
1526                 WARN_ON_ONCE(bucket->depth);
1527                 bucket++;
1528         }
1529
1530 out:
1531         spin_unlock_bh(&rt6_exception_lock);
1532 }
1533
1534 /* Find cached rt in the hash table inside passed in rt
1535  * Caller has to hold rcu_read_lock()
1536  */
1537 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
1538                                            struct in6_addr *daddr,
1539                                            struct in6_addr *saddr)
1540 {
1541         struct rt6_exception_bucket *bucket;
1542         struct in6_addr *src_key = NULL;
1543         struct rt6_exception *rt6_ex;
1544         struct rt6_info *res = NULL;
1545
1546         bucket = rcu_dereference(rt->rt6i_exception_bucket);
1547
1548 #ifdef CONFIG_IPV6_SUBTREES
1549         /* rt6i_src.plen != 0 indicates rt is in subtree
1550          * and exception table is indexed by a hash of
1551          * both rt6i_dst and rt6i_src.
1552          * Otherwise, the exception table is indexed by
1553          * a hash of only rt6i_dst.
1554          */
1555         if (rt->fib6_src.plen)
1556                 src_key = saddr;
1557 #endif
1558         rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1559
1560         if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1561                 res = rt6_ex->rt6i;
1562
1563         return res;
1564 }
1565
1566 /* Remove the passed in cached rt from the hash table that contains it */
1567 static int rt6_remove_exception_rt(struct rt6_info *rt)
1568 {
1569         struct rt6_exception_bucket *bucket;
1570         struct in6_addr *src_key = NULL;
1571         struct rt6_exception *rt6_ex;
1572         struct fib6_info *from;
1573         int err;
1574
1575         from = rcu_dereference(rt->from);
1576         if (!from ||
1577             !(rt->rt6i_flags & RTF_CACHE))
1578                 return -EINVAL;
1579
1580         if (!rcu_access_pointer(from->rt6i_exception_bucket))
1581                 return -ENOENT;
1582
1583         spin_lock_bh(&rt6_exception_lock);
1584         bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1585                                     lockdep_is_held(&rt6_exception_lock));
1586 #ifdef CONFIG_IPV6_SUBTREES
1587         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1588          * and exception table is indexed by a hash of
1589          * both rt6i_dst and rt6i_src.
1590          * Otherwise, the exception table is indexed by
1591          * a hash of only rt6i_dst.
1592          */
1593         if (from->fib6_src.plen)
1594                 src_key = &rt->rt6i_src.addr;
1595 #endif
1596         rt6_ex = __rt6_find_exception_spinlock(&bucket,
1597                                                &rt->rt6i_dst.addr,
1598                                                src_key);
1599         if (rt6_ex) {
1600                 rt6_remove_exception(bucket, rt6_ex);
1601                 err = 0;
1602         } else {
1603                 err = -ENOENT;
1604         }
1605
1606         spin_unlock_bh(&rt6_exception_lock);
1607         return err;
1608 }
1609
1610 /* Find rt6_ex which contains the passed in rt cache and
1611  * refresh its stamp
1612  */
1613 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1614 {
1615         struct rt6_exception_bucket *bucket;
1616         struct in6_addr *src_key = NULL;
1617         struct rt6_exception *rt6_ex;
1618         struct fib6_info *from;
1619
1620         rcu_read_lock();
1621         from = rcu_dereference(rt->from);
1622         if (!from || !(rt->rt6i_flags & RTF_CACHE))
1623                 goto unlock;
1624
1625         bucket = rcu_dereference(from->rt6i_exception_bucket);
1626
1627 #ifdef CONFIG_IPV6_SUBTREES
1628         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1629          * and exception table is indexed by a hash of
1630          * both rt6i_dst and rt6i_src.
1631          * Otherwise, the exception table is indexed by
1632          * a hash of only rt6i_dst.
1633          */
1634         if (from->fib6_src.plen)
1635                 src_key = &rt->rt6i_src.addr;
1636 #endif
1637         rt6_ex = __rt6_find_exception_rcu(&bucket,
1638                                           &rt->rt6i_dst.addr,
1639                                           src_key);
1640         if (rt6_ex)
1641                 rt6_ex->stamp = jiffies;
1642
1643 unlock:
1644         rcu_read_unlock();
1645 }
1646
1647 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1648                                          struct rt6_info *rt, int mtu)
1649 {
1650         /* If the new MTU is lower than the route PMTU, this new MTU will be the
1651          * lowest MTU in the path: always allow updating the route PMTU to
1652          * reflect PMTU decreases.
1653          *
1654          * If the new MTU is higher, and the route PMTU is equal to the local
1655          * MTU, this means the old MTU is the lowest in the path, so allow
1656          * updating it: if other nodes now have lower MTUs, PMTU discovery will
1657          * handle this.
1658          */
1659
1660         if (dst_mtu(&rt->dst) >= mtu)
1661                 return true;
1662
1663         if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1664                 return true;
1665
1666         return false;
1667 }
1668
1669 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
1670                                        struct fib6_info *rt, int mtu)
1671 {
1672         struct rt6_exception_bucket *bucket;
1673         struct rt6_exception *rt6_ex;
1674         int i;
1675
1676         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1677                                         lockdep_is_held(&rt6_exception_lock));
1678
1679         if (!bucket)
1680                 return;
1681
1682         for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1683                 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1684                         struct rt6_info *entry = rt6_ex->rt6i;
1685
1686                         /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
1687                          * route), the metrics of its rt->from have already
1688                          * been updated.
1689                          */
1690                         if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
1691                             rt6_mtu_change_route_allowed(idev, entry, mtu))
1692                                 dst_metric_set(&entry->dst, RTAX_MTU, mtu);
1693                 }
1694                 bucket++;
1695         }
1696 }
1697
1698 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
1699
1700 static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
1701                                         struct in6_addr *gateway)
1702 {
1703         struct rt6_exception_bucket *bucket;
1704         struct rt6_exception *rt6_ex;
1705         struct hlist_node *tmp;
1706         int i;
1707
1708         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1709                 return;
1710
1711         spin_lock_bh(&rt6_exception_lock);
1712         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1713                                      lockdep_is_held(&rt6_exception_lock));
1714
1715         if (bucket) {
1716                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1717                         hlist_for_each_entry_safe(rt6_ex, tmp,
1718                                                   &bucket->chain, hlist) {
1719                                 struct rt6_info *entry = rt6_ex->rt6i;
1720
1721                                 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1722                                     RTF_CACHE_GATEWAY &&
1723                                     ipv6_addr_equal(gateway,
1724                                                     &entry->rt6i_gateway)) {
1725                                         rt6_remove_exception(bucket, rt6_ex);
1726                                 }
1727                         }
1728                         bucket++;
1729                 }
1730         }
1731
1732         spin_unlock_bh(&rt6_exception_lock);
1733 }
1734
1735 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1736                                       struct rt6_exception *rt6_ex,
1737                                       struct fib6_gc_args *gc_args,
1738                                       unsigned long now)
1739 {
1740         struct rt6_info *rt = rt6_ex->rt6i;
1741
1742         /* we are pruning and obsoleting aged-out and non gateway exceptions
1743          * even if others have still references to them, so that on next
1744          * dst_check() such references can be dropped.
1745          * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
1746          * expired, independently from their aging, as per RFC 8201 section 4
1747          */
1748         if (!(rt->rt6i_flags & RTF_EXPIRES)) {
1749                 if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1750                         RT6_TRACE("aging clone %p\n", rt);
1751                         rt6_remove_exception(bucket, rt6_ex);
1752                         return;
1753                 }
1754         } else if (time_after(jiffies, rt->dst.expires)) {
1755                 RT6_TRACE("purging expired route %p\n", rt);
1756                 rt6_remove_exception(bucket, rt6_ex);
1757                 return;
1758         }
1759
1760         if (rt->rt6i_flags & RTF_GATEWAY) {
1761                 struct neighbour *neigh;
1762                 __u8 neigh_flags = 0;
1763
1764                 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
1765                 if (neigh)
1766                         neigh_flags = neigh->flags;
1767
1768                 if (!(neigh_flags & NTF_ROUTER)) {
1769                         RT6_TRACE("purging route %p via non-router but gateway\n",
1770                                   rt);
1771                         rt6_remove_exception(bucket, rt6_ex);
1772                         return;
1773                 }
1774         }
1775
1776         gc_args->more++;
1777 }
1778
1779 void rt6_age_exceptions(struct fib6_info *rt,
1780                         struct fib6_gc_args *gc_args,
1781                         unsigned long now)
1782 {
1783         struct rt6_exception_bucket *bucket;
1784         struct rt6_exception *rt6_ex;
1785         struct hlist_node *tmp;
1786         int i;
1787
1788         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1789                 return;
1790
1791         rcu_read_lock_bh();
1792         spin_lock(&rt6_exception_lock);
1793         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1794                                     lockdep_is_held(&rt6_exception_lock));
1795
1796         if (bucket) {
1797                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1798                         hlist_for_each_entry_safe(rt6_ex, tmp,
1799                                                   &bucket->chain, hlist) {
1800                                 rt6_age_examine_exception(bucket, rt6_ex,
1801                                                           gc_args, now);
1802                         }
1803                         bucket++;
1804                 }
1805         }
1806         spin_unlock(&rt6_exception_lock);
1807         rcu_read_unlock_bh();
1808 }
1809
1810 /* must be called with rcu lock held */
1811 struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
1812                                     int oif, struct flowi6 *fl6, int strict)
1813 {
1814         struct fib6_node *fn, *saved_fn;
1815         struct fib6_info *f6i;
1816
1817         fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1818         saved_fn = fn;
1819
1820         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1821                 oif = 0;
1822
1823 redo_rt6_select:
1824         f6i = rt6_select(net, fn, oif, strict);
1825         if (f6i == net->ipv6.fib6_null_entry) {
1826                 fn = fib6_backtrack(fn, &fl6->saddr);
1827                 if (fn)
1828                         goto redo_rt6_select;
1829                 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1830                         /* also consider unreachable route */
1831                         strict &= ~RT6_LOOKUP_F_REACHABLE;
1832                         fn = saved_fn;
1833                         goto redo_rt6_select;
1834                 }
1835         }
1836
1837         trace_fib6_table_lookup(net, f6i, table, fl6);
1838
1839         return f6i;
1840 }
1841
1842 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1843                                int oif, struct flowi6 *fl6,
1844                                const struct sk_buff *skb, int flags)
1845 {
1846         struct fib6_info *f6i;
1847         struct rt6_info *rt;
1848         int strict = 0;
1849
1850         strict |= flags & RT6_LOOKUP_F_IFACE;
1851         strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1852         if (net->ipv6.devconf_all->forwarding == 0)
1853                 strict |= RT6_LOOKUP_F_REACHABLE;
1854
1855         rcu_read_lock();
1856
1857         f6i = fib6_table_lookup(net, table, oif, fl6, strict);
1858         if (f6i == net->ipv6.fib6_null_entry) {
1859                 rt = net->ipv6.ip6_null_entry;
1860                 rcu_read_unlock();
1861                 dst_hold(&rt->dst);
1862                 return rt;
1863         }
1864
1865         if (f6i->fib6_nsiblings)
1866                 f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
1867
1868         /*Search through exception table */
1869         rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1870         if (rt) {
1871                 if (ip6_hold_safe(net, &rt))
1872                         dst_use_noref(&rt->dst, jiffies);
1873
1874                 rcu_read_unlock();
1875                 return rt;
1876         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1877                             !f6i->fib6_nh.fib_nh_gw_family)) {
1878                 /* Create a RTF_CACHE clone which will not be
1879                  * owned by the fib6 tree.  It is for the special case where
1880                  * the daddr in the skb during the neighbor look-up is different
1881                  * from the fl6->daddr used to look-up route here.
1882                  */
1883                 struct rt6_info *uncached_rt;
1884
1885                 uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);
1886
1887                 rcu_read_unlock();
1888
1889                 if (uncached_rt) {
1890                         /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1891                          * No need for another dst_hold()
1892                          */
1893                         rt6_uncached_list_add(uncached_rt);
1894                         atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1895                 } else {
1896                         uncached_rt = net->ipv6.ip6_null_entry;
1897                         dst_hold(&uncached_rt->dst);
1898                 }
1899
1900                 return uncached_rt;
1901         } else {
1902                 /* Get a percpu copy */
1903
1904                 struct rt6_info *pcpu_rt;
1905
1906                 local_bh_disable();
1907                 pcpu_rt = rt6_get_pcpu_route(f6i);
1908
1909                 if (!pcpu_rt)
1910                         pcpu_rt = rt6_make_pcpu_route(net, f6i);
1911
1912                 local_bh_enable();
1913                 rcu_read_unlock();
1914
1915                 return pcpu_rt;
1916         }
1917 }
1918 EXPORT_SYMBOL_GPL(ip6_pol_route);
1919
1920 static struct rt6_info *ip6_pol_route_input(struct net *net,
1921                                             struct fib6_table *table,
1922                                             struct flowi6 *fl6,
1923                                             const struct sk_buff *skb,
1924                                             int flags)
1925 {
1926         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1927 }
1928
1929 struct dst_entry *ip6_route_input_lookup(struct net *net,
1930                                          struct net_device *dev,
1931                                          struct flowi6 *fl6,
1932                                          const struct sk_buff *skb,
1933                                          int flags)
1934 {
1935         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1936                 flags |= RT6_LOOKUP_F_IFACE;
1937
1938         return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1939 }
1940 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1941
1942 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1943                                   struct flow_keys *keys,
1944                                   struct flow_keys *flkeys)
1945 {
1946         const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1947         const struct ipv6hdr *key_iph = outer_iph;
1948         struct flow_keys *_flkeys = flkeys;
1949         const struct ipv6hdr *inner_iph;
1950         const struct icmp6hdr *icmph;
1951         struct ipv6hdr _inner_iph;
1952         struct icmp6hdr _icmph;
1953
1954         if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1955                 goto out;
1956
1957         icmph = skb_header_pointer(skb, skb_transport_offset(skb),
1958                                    sizeof(_icmph), &_icmph);
1959         if (!icmph)
1960                 goto out;
1961
1962         if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1963             icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1964             icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1965             icmph->icmp6_type != ICMPV6_PARAMPROB)
1966                 goto out;
1967
1968         inner_iph = skb_header_pointer(skb,
1969                                        skb_transport_offset(skb) + sizeof(*icmph),
1970                                        sizeof(_inner_iph), &_inner_iph);
1971         if (!inner_iph)
1972                 goto out;
1973
1974         key_iph = inner_iph;
1975         _flkeys = NULL;
1976 out:
1977         if (_flkeys) {
1978                 keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
1979                 keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
1980                 keys->tags.flow_label = _flkeys->tags.flow_label;
1981                 keys->basic.ip_proto = _flkeys->basic.ip_proto;
1982         } else {
1983                 keys->addrs.v6addrs.src = key_iph->saddr;
1984                 keys->addrs.v6addrs.dst = key_iph->daddr;
1985                 keys->tags.flow_label = ip6_flowlabel(key_iph);
1986                 keys->basic.ip_proto = key_iph->nexthdr;
1987         }
1988 }
1989
1990 /* if skb is set it will be used and fl6 can be NULL */
1991 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
1992                        const struct sk_buff *skb, struct flow_keys *flkeys)
1993 {
1994         struct flow_keys hash_keys;
1995         u32 mhash;
1996
1997         switch (ip6_multipath_hash_policy(net)) {
1998         case 0:
1999                 memset(&hash_keys, 0, sizeof(hash_keys));
2000                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2001                 if (skb) {
2002                         ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
2003                 } else {
2004                         hash_keys.addrs.v6addrs.src = fl6->saddr;
2005                         hash_keys.addrs.v6addrs.dst = fl6->daddr;
2006                         hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
2007                         hash_keys.basic.ip_proto = fl6->flowi6_proto;
2008                 }
2009                 break;
2010         case 1:
2011                 if (skb) {
2012                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
2013                         struct flow_keys keys;
2014
2015                         /* short-circuit if we already have L4 hash present */
2016                         if (skb->l4_hash)
2017                                 return skb_get_hash_raw(skb) >> 1;
2018
2019                         memset(&hash_keys, 0, sizeof(hash_keys));
2020
2021                         if (!flkeys) {
2022                                 skb_flow_dissect_flow_keys(skb, &keys, flag);
2023                                 flkeys = &keys;
2024                         }
2025                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2026                         hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2027                         hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2028                         hash_keys.ports.src = flkeys->ports.src;
2029                         hash_keys.ports.dst = flkeys->ports.dst;
2030                         hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2031                 } else {
2032                         memset(&hash_keys, 0, sizeof(hash_keys));
2033                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2034                         hash_keys.addrs.v6addrs.src = fl6->saddr;
2035                         hash_keys.addrs.v6addrs.dst = fl6->daddr;
2036                         hash_keys.ports.src = fl6->fl6_sport;
2037                         hash_keys.ports.dst = fl6->fl6_dport;
2038                         hash_keys.basic.ip_proto = fl6->flowi6_proto;
2039                 }
2040                 break;
2041         }
2042         mhash = flow_hash_from_keys(&hash_keys);
2043
2044         return mhash >> 1;
2045 }
2046
2047 void ip6_route_input(struct sk_buff *skb)
2048 {
2049         const struct ipv6hdr *iph = ipv6_hdr(skb);
2050         struct net *net = dev_net(skb->dev);
2051         int flags = RT6_LOOKUP_F_HAS_SADDR;
2052         struct ip_tunnel_info *tun_info;
2053         struct flowi6 fl6 = {
2054                 .flowi6_iif = skb->dev->ifindex,
2055                 .daddr = iph->daddr,
2056                 .saddr = iph->saddr,
2057                 .flowlabel = ip6_flowinfo(iph),
2058                 .flowi6_mark = skb->mark,
2059                 .flowi6_proto = iph->nexthdr,
2060         };
2061         struct flow_keys *flkeys = NULL, _flkeys;
2062
2063         tun_info = skb_tunnel_info(skb);
2064         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2065                 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
2066
2067         if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
2068                 flkeys = &_flkeys;
2069
2070         if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
2071                 fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
2072         skb_dst_drop(skb);
2073         skb_dst_set(skb,
2074                     ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
2075 }
2076
2077 static struct rt6_info *ip6_pol_route_output(struct net *net,
2078                                              struct fib6_table *table,
2079                                              struct flowi6 *fl6,
2080                                              const struct sk_buff *skb,
2081                                              int flags)
2082 {
2083         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2084 }
2085
2086 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
2087                                          struct flowi6 *fl6, int flags)
2088 {
2089         bool any_src;
2090
2091         if (ipv6_addr_type(&fl6->daddr) &
2092             (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
2093                 struct dst_entry *dst;
2094
2095                 dst = l3mdev_link_scope_lookup(net, fl6);
2096                 if (dst)
2097                         return dst;
2098         }
2099
2100         fl6->flowi6_iif = LOOPBACK_IFINDEX;
2101
2102         any_src = ipv6_addr_any(&fl6->saddr);
2103         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2104             (fl6->flowi6_oif && any_src))
2105                 flags |= RT6_LOOKUP_F_IFACE;
2106
2107         if (!any_src)
2108                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2109         else if (sk)
2110                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2111
2112         return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2113 }
2114 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2115
2116 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2117 {
2118         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
2119         struct net_device *loopback_dev = net->loopback_dev;
2120         struct dst_entry *new = NULL;
2121
2122         rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2123                        DST_OBSOLETE_DEAD, 0);
2124         if (rt) {
2125                 rt6_info_init(rt);
2126                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2127
2128                 new = &rt->dst;
2129                 new->__use = 1;
2130                 new->input = dst_discard;
2131                 new->output = dst_discard_out;
2132
2133                 dst_copy_metrics(new, &ort->dst);
2134
2135                 rt->rt6i_idev = in6_dev_get(loopback_dev);
2136                 rt->rt6i_gateway = ort->rt6i_gateway;
2137                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2138
2139                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2140 #ifdef CONFIG_IPV6_SUBTREES
2141                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2142 #endif
2143         }
2144
2145         dst_release(dst_orig);
2146         return new ? new : ERR_PTR(-ENOMEM);
2147 }
2148
2149 /*
2150  *      Destination cache support functions
2151  */
2152
2153 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2154 {
2155         u32 rt_cookie = 0;
2156
2157         if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2158                 return false;
2159
2160         if (fib6_check_expired(f6i))
2161                 return false;
2162
2163         return true;
2164 }
2165
2166 static struct dst_entry *rt6_check(struct rt6_info *rt,
2167                                    struct fib6_info *from,
2168                                    u32 cookie)
2169 {
2170         u32 rt_cookie = 0;
2171
2172         if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2173             rt_cookie != cookie)
2174                 return NULL;
2175
2176         if (rt6_check_expired(rt))
2177                 return NULL;
2178
2179         return &rt->dst;
2180 }
2181
2182 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2183                                             struct fib6_info *from,
2184                                             u32 cookie)
2185 {
2186         if (!__rt6_check_expired(rt) &&
2187             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2188             fib6_check(from, cookie))
2189                 return &rt->dst;
2190         else
2191                 return NULL;
2192 }
2193
2194 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2195 {
2196         struct dst_entry *dst_ret;
2197         struct fib6_info *from;
2198         struct rt6_info *rt;
2199
2200         rt = container_of(dst, struct rt6_info, dst);
2201
2202         rcu_read_lock();
2203
2204         /* All IPV6 dsts are created with ->obsolete set to the value
2205          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2206          * into this function always.
2207          */
2208
2209         from = rcu_dereference(rt->from);
2210
2211         if (from && (rt->rt6i_flags & RTF_PCPU ||
2212             unlikely(!list_empty(&rt->rt6i_uncached))))
2213                 dst_ret = rt6_dst_from_check(rt, from, cookie);
2214         else
2215                 dst_ret = rt6_check(rt, from, cookie);
2216
2217         rcu_read_unlock();
2218
2219         return dst_ret;
2220 }
2221
2222 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2223 {
2224         struct rt6_info *rt = (struct rt6_info *) dst;
2225
2226         if (rt) {
2227                 if (rt->rt6i_flags & RTF_CACHE) {
2228                         rcu_read_lock();
2229                         if (rt6_check_expired(rt)) {
2230                                 rt6_remove_exception_rt(rt);
2231                                 dst = NULL;
2232                         }
2233                         rcu_read_unlock();
2234                 } else {
2235                         dst_release(dst);
2236                         dst = NULL;
2237                 }
2238         }
2239         return dst;
2240 }
2241
2242 static void ip6_link_failure(struct sk_buff *skb)
2243 {
2244         struct rt6_info *rt;
2245
2246         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2247
2248         rt = (struct rt6_info *) skb_dst(skb);
2249         if (rt) {
2250                 rcu_read_lock();
2251                 if (rt->rt6i_flags & RTF_CACHE) {
2252                         rt6_remove_exception_rt(rt);
2253                 } else {
2254                         struct fib6_info *from;
2255                         struct fib6_node *fn;
2256
2257                         from = rcu_dereference(rt->from);
2258                         if (from) {
2259                                 fn = rcu_dereference(from->fib6_node);
2260                                 if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2261                                         fn->fn_sernum = -1;
2262                         }
2263                 }
2264                 rcu_read_unlock();
2265         }
2266 }
2267
2268 static void rt6_update_expires(struct rt6_info *rt0, int timeout)
2269 {
2270         if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
2271                 struct fib6_info *from;
2272
2273                 rcu_read_lock();
2274                 from = rcu_dereference(rt0->from);
2275                 if (from)
2276                         rt0->dst.expires = from->expires;
2277                 rcu_read_unlock();
2278         }
2279
2280         dst_set_expires(&rt0->dst, timeout);
2281         rt0->rt6i_flags |= RTF_EXPIRES;
2282 }
2283
2284 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2285 {
2286         struct net *net = dev_net(rt->dst.dev);
2287
2288         dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2289         rt->rt6i_flags |= RTF_MODIFIED;
2290         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2291 }
2292
2293 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2294 {
2295         return !(rt->rt6i_flags & RTF_CACHE) &&
2296                 (rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from));
2297 }
2298
2299 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2300                                  const struct ipv6hdr *iph, u32 mtu)
2301 {
2302         const struct in6_addr *daddr, *saddr;
2303         struct rt6_info *rt6 = (struct rt6_info *)dst;
2304
2305         if (dst_metric_locked(dst, RTAX_MTU))
2306                 return;
2307
2308         if (iph) {
2309                 daddr = &iph->daddr;
2310                 saddr = &iph->saddr;
2311         } else if (sk) {
2312                 daddr = &sk->sk_v6_daddr;
2313                 saddr = &inet6_sk(sk)->saddr;
2314         } else {
2315                 daddr = NULL;
2316                 saddr = NULL;
2317         }
2318         dst_confirm_neigh(dst, daddr);
2319         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2320         if (mtu >= dst_mtu(dst))
2321                 return;
2322
2323         if (!rt6_cache_allowed_for_pmtu(rt6)) {
2324                 rt6_do_update_pmtu(rt6, mtu);
2325                 /* update rt6_ex->stamp for cache */
2326                 if (rt6->rt6i_flags & RTF_CACHE)
2327                         rt6_update_exception_stamp_rt(rt6);
2328         } else if (daddr) {
2329                 struct fib6_info *from;
2330                 struct rt6_info *nrt6;
2331
2332                 rcu_read_lock();
2333                 from = rcu_dereference(rt6->from);
2334                 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
2335                 if (nrt6) {
2336                         rt6_do_update_pmtu(nrt6, mtu);
2337                         if (rt6_insert_exception(nrt6, from))
2338                                 dst_release_immediate(&nrt6->dst);
2339                 }
2340                 rcu_read_unlock();
2341         }
2342 }
2343
2344 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2345                                struct sk_buff *skb, u32 mtu)
2346 {
2347         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2348 }
2349
2350 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2351                      int oif, u32 mark, kuid_t uid)
2352 {
2353         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2354         struct dst_entry *dst;
2355         struct flowi6 fl6 = {
2356                 .flowi6_oif = oif,
2357                 .flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
2358                 .daddr = iph->daddr,
2359                 .saddr = iph->saddr,
2360                 .flowlabel = ip6_flowinfo(iph),
2361                 .flowi6_uid = uid,
2362         };
2363
2364         dst = ip6_route_output(net, NULL, &fl6);
2365         if (!dst->error)
2366                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2367         dst_release(dst);
2368 }
2369 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2370
2371 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2372 {
2373         int oif = sk->sk_bound_dev_if;
2374         struct dst_entry *dst;
2375
2376         if (!oif && skb->dev)
2377                 oif = l3mdev_master_ifindex(skb->dev);
2378
2379         ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);
2380
2381         dst = __sk_dst_get(sk);
2382         if (!dst || !dst->obsolete ||
2383             dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2384                 return;
2385
2386         bh_lock_sock(sk);
2387         if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2388                 ip6_datagram_dst_update(sk, false);
2389         bh_unlock_sock(sk);
2390 }
2391 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2392
2393 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2394                            const struct flowi6 *fl6)
2395 {
2396 #ifdef CONFIG_IPV6_SUBTREES
2397         struct ipv6_pinfo *np = inet6_sk(sk);
2398 #endif
2399
2400         ip6_dst_store(sk, dst,
2401                       ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2402                       &sk->sk_v6_daddr : NULL,
2403 #ifdef CONFIG_IPV6_SUBTREES
2404                       ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2405                       &np->saddr :
2406 #endif
2407                       NULL);
2408 }
2409
2410 static bool ip6_redirect_nh_match(struct fib6_info *f6i,
2411                                   struct fib6_nh *nh,
2412                                   struct flowi6 *fl6,
2413                                   const struct in6_addr *gw,
2414                                   struct rt6_info **ret)
2415 {
2416         if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family ||
2417             fl6->flowi6_oif != nh->fib_nh_dev->ifindex)
2418                 return false;
2419
2420         /* rt_cache's gateway might be different from its 'parent'
2421          * in the case of an ip redirect.
2422          * So we keep searching in the exception table if the gateway
2423          * is different.
2424          */
2425         if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) {
2426                 struct rt6_info *rt_cache;
2427
2428                 rt_cache = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
2429                 if (rt_cache &&
2430                     ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) {
2431                         *ret = rt_cache;
2432                         return true;
2433                 }
2434                 return false;
2435         }
2436         return true;
2437 }
2438
2439 /* Handle redirects */
2440 struct ip6rd_flowi {
2441         struct flowi6 fl6;
2442         struct in6_addr gateway;
2443 };
2444
2445 static struct rt6_info *__ip6_route_redirect(struct net *net,
2446                                              struct fib6_table *table,
2447                                              struct flowi6 *fl6,
2448                                              const struct sk_buff *skb,
2449                                              int flags)
2450 {
2451         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2452         struct rt6_info *ret = NULL;
2453         struct fib6_info *rt;
2454         struct fib6_node *fn;
2455
2456         /* Get the "current" route for this destination and
2457          * check if the redirect has come from appropriate router.
2458          *
2459          * RFC 4861 specifies that redirects should only be
2460          * accepted if they come from the nexthop to the target.
2461          * Due to the way the routes are chosen, this notion
2462          * is a bit fuzzy and one might need to check all possible
2463          * routes.
2464          */
2465
2466         rcu_read_lock();
2467         fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2468 restart:
2469         for_each_fib6_node_rt_rcu(fn) {
2470                 if (fib6_check_expired(rt))
2471                         continue;
2472                 if (rt->fib6_flags & RTF_REJECT)
2473                         break;
2474                 if (fl6->flowi6_oif != rt->fib6_nh.fib_nh_dev->ifindex)
2475                         continue;
2476                 if (ip6_redirect_nh_match(rt, &rt->fib6_nh, fl6,
2477                                           &rdfl->gateway, &ret))
2478                         goto out;
2479         }
2480
2481         if (!rt)
2482                 rt = net->ipv6.fib6_null_entry;
2483         else if (rt->fib6_flags & RTF_REJECT) {
2484                 ret = net->ipv6.ip6_null_entry;
2485                 goto out;
2486         }
2487
2488         if (rt == net->ipv6.fib6_null_entry) {
2489                 fn = fib6_backtrack(fn, &fl6->saddr);
2490                 if (fn)
2491                         goto restart;
2492         }
2493
2494 out:
2495         if (ret)
2496                 ip6_hold_safe(net, &ret);
2497         else
2498                 ret = ip6_create_rt_rcu(rt);
2499
2500         rcu_read_unlock();
2501
2502         trace_fib6_table_lookup(net, rt, table, fl6);
2503         return ret;
2504 };
2505
2506 static struct dst_entry *ip6_route_redirect(struct net *net,
2507                                             const struct flowi6 *fl6,
2508                                             const struct sk_buff *skb,
2509                                             const struct in6_addr *gateway)
2510 {
2511         int flags = RT6_LOOKUP_F_HAS_SADDR;
2512         struct ip6rd_flowi rdfl;
2513
2514         rdfl.fl6 = *fl6;
2515         rdfl.gateway = *gateway;
2516
2517         return fib6_rule_lookup(net, &rdfl.fl6, skb,
2518                                 flags, __ip6_route_redirect);
2519 }
2520
2521 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2522                   kuid_t uid)
2523 {
2524         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2525         struct dst_entry *dst;
2526         struct flowi6 fl6 = {
2527                 .flowi6_iif = LOOPBACK_IFINDEX,
2528                 .flowi6_oif = oif,
2529                 .flowi6_mark = mark,
2530                 .daddr = iph->daddr,
2531                 .saddr = iph->saddr,
2532                 .flowlabel = ip6_flowinfo(iph),
2533                 .flowi6_uid = uid,
2534         };
2535
2536         dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2537         rt6_do_redirect(dst, NULL, skb);
2538         dst_release(dst);
2539 }
2540 EXPORT_SYMBOL_GPL(ip6_redirect);
2541
2542 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
2543 {
2544         const struct ipv6hdr *iph = ipv6_hdr(skb);
2545         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2546         struct dst_entry *dst;
2547         struct flowi6 fl6 = {
2548                 .flowi6_iif = LOOPBACK_IFINDEX,
2549                 .flowi6_oif = oif,
2550                 .daddr = msg->dest,
2551                 .saddr = iph->daddr,
2552                 .flowi6_uid = sock_net_uid(net, NULL),
2553         };
2554
2555         dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2556         rt6_do_redirect(dst, NULL, skb);
2557         dst_release(dst);
2558 }
2559
2560 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2561 {
2562         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2563                      sk->sk_uid);
2564 }
2565 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2566
2567 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2568 {
2569         struct net_device *dev = dst->dev;
2570         unsigned int mtu = dst_mtu(dst);
2571         struct net *net = dev_net(dev);
2572
2573         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2574
2575         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2576                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2577
2578         /*
2579          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2580          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2581          * IPV6_MAXPLEN is also valid and means: "any MSS,
2582          * rely only on pmtu discovery"
2583          */
2584         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2585                 mtu = IPV6_MAXPLEN;
2586         return mtu;
2587 }
2588
2589 static unsigned int ip6_mtu(const struct dst_entry *dst)
2590 {
2591         struct inet6_dev *idev;
2592         unsigned int mtu;
2593
2594         mtu = dst_metric_raw(dst, RTAX_MTU);
2595         if (mtu)
2596                 goto out;
2597
2598         mtu = IPV6_MIN_MTU;
2599
2600         rcu_read_lock();
2601         idev = __in6_dev_get(dst->dev);
2602         if (idev)
2603                 mtu = idev->cnf.mtu6;
2604         rcu_read_unlock();
2605
2606 out:
2607         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2608
2609         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2610 }
2611
2612 /* MTU selection:
2613  * 1. mtu on route is locked - use it
2614  * 2. mtu from nexthop exception
2615  * 3. mtu from egress device
2616  *
2617  * based on ip6_dst_mtu_forward and exception logic of
2618  * rt6_find_cached_rt; called with rcu_read_lock
2619  */
2620 u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
2621                       struct in6_addr *saddr)
2622 {
2623         struct rt6_exception_bucket *bucket;
2624         struct rt6_exception *rt6_ex;
2625         struct in6_addr *src_key;
2626         struct inet6_dev *idev;
2627         u32 mtu = 0;
2628
2629         if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
2630                 mtu = f6i->fib6_pmtu;
2631                 if (mtu)
2632                         goto out;
2633         }
2634
2635         src_key = NULL;
2636 #ifdef CONFIG_IPV6_SUBTREES
2637         if (f6i->fib6_src.plen)
2638                 src_key = saddr;
2639 #endif
2640
2641         bucket = rcu_dereference(f6i->rt6i_exception_bucket);
2642         rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
2643         if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
2644                 mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);
2645
2646         if (likely(!mtu)) {
2647                 struct net_device *dev = fib6_info_nh_dev(f6i);
2648
2649                 mtu = IPV6_MIN_MTU;
2650                 idev = __in6_dev_get(dev);
2651                 if (idev && idev->cnf.mtu6 > mtu)
2652                         mtu = idev->cnf.mtu6;
2653         }
2654
2655         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2656 out:
2657         return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
2658 }
2659
2660 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2661                                   struct flowi6 *fl6)
2662 {
2663         struct dst_entry *dst;
2664         struct rt6_info *rt;
2665         struct inet6_dev *idev = in6_dev_get(dev);
2666         struct net *net = dev_net(dev);
2667
2668         if (unlikely(!idev))
2669                 return ERR_PTR(-ENODEV);
2670
2671         rt = ip6_dst_alloc(net, dev, 0);
2672         if (unlikely(!rt)) {
2673                 in6_dev_put(idev);
2674                 dst = ERR_PTR(-ENOMEM);
2675                 goto out;
2676         }
2677
2678         rt->dst.flags |= DST_HOST;
2679         rt->dst.input = ip6_input;
2680         rt->dst.output  = ip6_output;
2681         rt->rt6i_gateway  = fl6->daddr;
2682         rt->rt6i_dst.addr = fl6->daddr;
2683         rt->rt6i_dst.plen = 128;
2684         rt->rt6i_idev     = idev;
2685         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2686
2687         /* Add this dst into uncached_list so that rt6_disable_ip() can
2688          * do proper release of the net_device
2689          */
2690         rt6_uncached_list_add(rt);
2691         atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2692
2693         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2694
2695 out:
2696         return dst;
2697 }
2698
2699 static int ip6_dst_gc(struct dst_ops *ops)
2700 {
2701         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2702         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2703         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2704         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2705         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2706         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2707         int entries;
2708
2709         entries = dst_entries_get_fast(ops);
2710         if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2711             entries <= rt_max_size)
2712                 goto out;
2713
2714         net->ipv6.ip6_rt_gc_expire++;
2715         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2716         entries = dst_entries_get_slow(ops);
2717         if (entries < ops->gc_thresh)
2718                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2719 out:
2720         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2721         return entries > rt_max_size;
2722 }
2723
2724 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2725                                             struct fib6_config *cfg,
2726                                             const struct in6_addr *gw_addr,
2727                                             u32 tbid, int flags)
2728 {
2729         struct flowi6 fl6 = {
2730                 .flowi6_oif = cfg->fc_ifindex,
2731                 .daddr = *gw_addr,
2732                 .saddr = cfg->fc_prefsrc,
2733         };
2734         struct fib6_table *table;
2735         struct rt6_info *rt;
2736
2737         table = fib6_get_table(net, tbid);
2738         if (!table)
2739                 return NULL;
2740
2741         if (!ipv6_addr_any(&cfg->fc_prefsrc))
2742                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2743
2744         flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2745         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2746
2747         /* if table lookup failed, fall back to full lookup */
2748         if (rt == net->ipv6.ip6_null_entry) {
2749                 ip6_rt_put(rt);
2750                 rt = NULL;
2751         }
2752
2753         return rt;
2754 }
2755
2756 static int ip6_route_check_nh_onlink(struct net *net,
2757                                      struct fib6_config *cfg,
2758                                      const struct net_device *dev,
2759                                      struct netlink_ext_ack *extack)
2760 {
2761         u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2762         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2763         u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2764         struct fib6_info *from;
2765         struct rt6_info *grt;
2766         int err;
2767
2768         err = 0;
2769         grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2770         if (grt) {
2771                 rcu_read_lock();
2772                 from = rcu_dereference(grt->from);
2773                 if (!grt->dst.error &&
2774                     /* ignore match if it is the default route */
2775                     from && !ipv6_addr_any(&from->fib6_dst.addr) &&
2776                     (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2777                         NL_SET_ERR_MSG(extack,
2778                                        "Nexthop has invalid gateway or device mismatch");
2779                         err = -EINVAL;
2780                 }
2781                 rcu_read_unlock();
2782
2783                 ip6_rt_put(grt);
2784         }
2785
2786         return err;
2787 }
2788
2789 static int ip6_route_check_nh(struct net *net,
2790                               struct fib6_config *cfg,
2791                               struct net_device **_dev,
2792                               struct inet6_dev **idev)
2793 {
2794         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2795         struct net_device *dev = _dev ? *_dev : NULL;
2796         struct rt6_info *grt = NULL;
2797         int err = -EHOSTUNREACH;
2798
2799         if (cfg->fc_table) {
2800                 int flags = RT6_LOOKUP_F_IFACE;
2801
2802                 grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2803                                           cfg->fc_table, flags);
2804                 if (grt) {
2805                         if (grt->rt6i_flags & RTF_GATEWAY ||
2806                             (dev && dev != grt->dst.dev)) {
2807                                 ip6_rt_put(grt);
2808                                 grt = NULL;
2809                         }
2810                 }
2811         }
2812
2813         if (!grt)
2814                 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2815
2816         if (!grt)
2817                 goto out;
2818
2819         if (dev) {
2820                 if (dev != grt->dst.dev) {
2821                         ip6_rt_put(grt);
2822                         goto out;
2823                 }
2824         } else {
2825                 *_dev = dev = grt->dst.dev;
2826                 *idev = grt->rt6i_idev;
2827                 dev_hold(dev);
2828                 in6_dev_hold(grt->rt6i_idev);
2829         }
2830
2831         if (!(grt->rt6i_flags & RTF_GATEWAY))
2832                 err = 0;
2833
2834         ip6_rt_put(grt);
2835
2836 out:
2837         return err;
2838 }
2839
/* Validate cfg->fc_gateway as a nexthop gateway and make sure an egress
 * device is known.
 *
 * @_dev/@idev: in/out; *_dev may be NULL on entry, in which case
 *              ip6_route_check_nh() may fill both in.
 * @extack:     receives a message for the failures detected here.
 *
 * Returns 0 on success.  Failures detected directly in this function
 * return -EINVAL; an error from ip6_route_check_nh_onlink() or
 * ip6_route_check_nh() is passed through unchanged.
 */
static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
                           struct net_device **_dev, struct inet6_dev **idev,
                           struct netlink_ext_ack *extack)
{
        const struct in6_addr *gw_addr = &cfg->fc_gateway;
        int gwa_type = ipv6_addr_type(gw_addr);
        /* false only for link-local gateways; handed to
         * ipv6_chk_addr_and_flags() as its skip_dev argument
         */
        bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
        const struct net_device *dev = *_dev;
        /* without a device the local-address check is deferred until the
         * device has been resolved further down
         */
        bool need_addr_check = !dev;
        int err = -EINVAL;

        /* if gw_addr is local we will fail to detect this in case
         * address is still TENTATIVE (DAD in progress). rt6_lookup()
         * will return already-added prefix route via interface that
         * prefix route was assigned to, which might be non-loopback.
         */
        if (dev &&
            ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
                NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
                goto out;
        }

        if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
                /* IPv6 strictly inhibits using not link-local
                 * addresses as nexthop address.
                 * Otherwise, router will not able to send redirects.
                 * It is very good, but in some (rare!) circumstances
                 * (SIT, PtP, NBMA NOARP links) it is handy to allow
                 * some exceptions. --ANK
                 * We allow IPv4-mapped nexthops to support RFC4798-type
                 * addressing
                 */
                if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
                        NL_SET_ERR_MSG(extack, "Invalid gateway address");
                        goto out;
                }

                /* onlink nexthops get a conflict check only; all others
                 * must be reachable through an existing route
                 */
                if (cfg->fc_flags & RTNH_F_ONLINK)
                        err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
                else
                        err = ip6_route_check_nh(net, cfg, _dev, idev);

                if (err)
                        goto out;
        }

        /* reload in case device was changed */
        dev = *_dev;

        /* err may have been set to 0 above; re-arm the default error */
        err = -EINVAL;
        if (!dev) {
                NL_SET_ERR_MSG(extack, "Egress device not specified");
                goto out;
        } else if (dev->flags & IFF_LOOPBACK) {
                NL_SET_ERR_MSG(extack,
                               "Egress device can not be loopback device for this route");
                goto out;
        }

        /* if we did not check gw_addr above, do so now that the
         * egress device has been resolved.
         */
        if (need_addr_check &&
            ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
                NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
                goto out;
        }

        err = 0;
out:
        return err;
}
2912
2913 static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type)
2914 {
2915         if ((flags & RTF_REJECT) ||
2916             (dev && (dev->flags & IFF_LOOPBACK) &&
2917              !(addr_type & IPV6_ADDR_LOOPBACK) &&
2918              !(flags & RTF_LOCAL)))
2919                 return true;
2920
2921         return false;
2922 }
2923
/* Initialise @fib6_nh from the route configuration @cfg.
 *
 * Resolves the nexthop device (from cfg->fc_ifindex and/or the gateway
 * validation), applies the onlink, reject, device-state and carrier
 * checks, and sets up the common nexthop part via fib_nh_common_init().
 *
 * Reference handling as visible here: the inet6_dev reference is always
 * dropped before returning.  The device reference is dropped only on
 * error; on success the device is stored in fib6_nh->fib_nh_dev.
 *
 * Returns 0 on success or a negative errno; @extack carries a message for
 * most failures.
 */
int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
                 struct fib6_config *cfg, gfp_t gfp_flags,
                 struct netlink_ext_ack *extack)
{
        struct net_device *dev = NULL;
        struct inet6_dev *idev = NULL;
        int addr_type;
        int err;

        fib6_nh->fib_nh_family = AF_INET6;

        /* default error for the device lookups and the onlink check below */
        err = -ENODEV;
        if (cfg->fc_ifindex) {
                dev = dev_get_by_index(net, cfg->fc_ifindex);
                if (!dev)
                        goto out;
                idev = in6_dev_get(dev);
                if (!idev)
                        goto out;
        }

        if (cfg->fc_flags & RTNH_F_ONLINK) {
                if (!dev) {
                        /* fails with the -ENODEV set above */
                        NL_SET_ERR_MSG(extack,
                                       "Nexthop device required for onlink");
                        goto out;
                }

                if (!(dev->flags & IFF_UP)) {
                        NL_SET_ERR_MSG(extack, "Nexthop device is not up");
                        err = -ENETDOWN;
                        goto out;
                }

                fib6_nh->fib_nh_flags |= RTNH_F_ONLINK;
        }

        fib6_nh->fib_nh_weight = 1;

        /* We cannot add true routes via loopback here,
         * they would result in kernel looping; promote them to reject routes
         */
        addr_type = ipv6_addr_type(&cfg->fc_dst);
        if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) {
                /* hold loopback dev/idev if we haven't done so. */
                if (dev != net->loopback_dev) {
                        /* swap the references held so far for loopback ones */
                        if (dev) {
                                dev_put(dev);
                                in6_dev_put(idev);
                        }
                        dev = net->loopback_dev;
                        dev_hold(dev);
                        idev = in6_dev_get(dev);
                        if (!idev) {
                                err = -ENODEV;
                                goto out;
                        }
                }
                /* reject routes skip gateway, device-state and encap setup */
                goto set_dev;
        }

        if (cfg->fc_flags & RTF_GATEWAY) {
                /* may fill in dev/idev when no ifindex was given */
                err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
                if (err)
                        goto out;

                fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
                fib6_nh->fib_nh_gw_family = AF_INET6;
        }

        err = -ENODEV;
        if (!dev)
                goto out;

        if (idev->cnf.disable_ipv6) {
                NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
                err = -EACCES;
                goto out;
        }

        if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) {
                NL_SET_ERR_MSG(extack, "Nexthop device is not up");
                err = -ENETDOWN;
                goto out;
        }

        /* local/anycast routes are not marked down on carrier loss */
        if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
            !netif_carrier_ok(dev))
                fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;

        err = fib_nh_common_init(&fib6_nh->nh_common, cfg->fc_encap,
                                 cfg->fc_encap_type, cfg, gfp_flags, extack);
        if (err)
                goto out;
set_dev:
        fib6_nh->fib_nh_dev = dev;
        fib6_nh->fib_nh_oif = dev->ifindex;
        err = 0;
out:
        /* the idev reference is only needed during setup */
        if (idev)
                in6_dev_put(idev);

        if (err) {
                /* undo partial setup: drop lwtunnel state and the device */
                lwtstate_put(fib6_nh->fib_nh_lws);
                fib6_nh->fib_nh_lws = NULL;
                if (dev)
                        dev_put(dev);
        }

        return err;
}
3035
3036 void fib6_nh_release(struct fib6_nh *fib6_nh)
3037 {
3038         fib_nh_common_release(&fib6_nh->nh_common);
3039 }
3040
/* Build a fib6_info from the route configuration @cfg.
 *
 * Validates the user supplied flags, type and prefix lengths, finds or
 * creates the FIB table, allocates the entry and fills in metrics,
 * expiry, protocol, prefixes, nexthop and preferred source.
 *
 * Note that cfg->fc_protocol is rewritten to RTPROT_BOOT when it was
 * RTPROT_UNSPEC.
 *
 * Returns the new entry, or ERR_PTR(-errno) on failure with @extack set
 * for most validation errors.
 */
static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
                                              gfp_t gfp_flags,
                                              struct netlink_ext_ack *extack)
{
        struct net *net = cfg->fc_nlinfo.nl_net;
        struct fib6_info *rt = NULL;
        struct fib6_table *table;
        int err = -EINVAL;
        int addr_type;

        /* RTF_PCPU is an internal flag; can not be set by userspace */
        if (cfg->fc_flags & RTF_PCPU) {
                NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
                goto out;
        }

        /* RTF_CACHE is an internal flag; can not be set by userspace */
        if (cfg->fc_flags & RTF_CACHE) {
                NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
                goto out;
        }

        if (cfg->fc_type > RTN_MAX) {
                NL_SET_ERR_MSG(extack, "Invalid route type");
                goto out;
        }

        if (cfg->fc_dst_len > 128) {
                NL_SET_ERR_MSG(extack, "Invalid prefix length");
                goto out;
        }
        if (cfg->fc_src_len > 128) {
                NL_SET_ERR_MSG(extack, "Invalid source address length");
                goto out;
        }
#ifndef CONFIG_IPV6_SUBTREES
        if (cfg->fc_src_len) {
                NL_SET_ERR_MSG(extack,
                               "Specifying source address requires IPV6_SUBTREES to be enabled");
                goto out;
        }
#endif

        /* error reported when no table can be found or created */
        err = -ENOBUFS;
        if (cfg->fc_nlinfo.nlh &&
            !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
                /* request without NLM_F_CREATE: prefer an existing table,
                 * but still create one (with a warning) if it is missing
                 */
                table = fib6_get_table(net, cfg->fc_table);
                if (!table) {
                        pr_warn("NLM_F_CREATE should be specified when creating new route\n");
                        table = fib6_new_table(net, cfg->fc_table);
                }
        } else {
                table = fib6_new_table(net, cfg->fc_table);
        }

        if (!table)
                goto out;

        err = -ENOMEM;
        rt = fib6_info_alloc(gfp_flags);
        if (!rt)
                goto out;

        rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
                                               extack);
        if (IS_ERR(rt->fib6_metrics)) {
                err = PTR_ERR(rt->fib6_metrics);
                /* Do not leave garbage there. */
                rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
                goto out;
        }

        if (cfg->fc_flags & RTF_ADDRCONF)
                rt->dst_nocount = true;

        if (cfg->fc_flags & RTF_EXPIRES)
                fib6_set_expires(rt, jiffies +
                                clock_t_to_jiffies(cfg->fc_expires));
        else
                fib6_clean_expires(rt);

        if (cfg->fc_protocol == RTPROT_UNSPEC)
                cfg->fc_protocol = RTPROT_BOOT;
        rt->fib6_protocol = cfg->fc_protocol;

        rt->fib6_table = table;
        rt->fib6_metric = cfg->fc_metric;
        rt->fib6_type = cfg->fc_type;
        /* RTF_GATEWAY is not stored in fib6_flags */
        rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY;

        ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
        rt->fib6_dst.plen = cfg->fc_dst_len;
        if (rt->fib6_dst.plen == 128)
                rt->dst_host = true;

#ifdef CONFIG_IPV6_SUBTREES
        ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
        rt->fib6_src.plen = cfg->fc_src_len;
#endif
        err = fib6_nh_init(net, &rt->fib6_nh, cfg, gfp_flags, extack);
        if (err)
                goto out;

        /* We cannot add true routes via loopback here,
         * they would result in kernel looping; promote them to reject routes
         */
        addr_type = ipv6_addr_type(&cfg->fc_dst);
        if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh.fib_nh_dev, addr_type))
                rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;

        if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
                struct net_device *dev = fib6_info_nh_dev(rt);

                /* the preferred source is checked with ipv6_chk_addr()
                 * against the nexthop device
                 */
                if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
                        NL_SET_ERR_MSG(extack, "Invalid source address");
                        err = -EINVAL;
                        goto out;
                }
                rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
                rt->fib6_prefsrc.plen = 128;
        } else
                rt->fib6_prefsrc.plen = 0;

        return rt;
out:
        /* NOTE(review): rt is still NULL for the early validation failures;
         * fib6_info_release() presumably tolerates NULL - confirm.
         */
        fib6_info_release(rt);
        return ERR_PTR(err);
}
3169
3170 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3171                   struct netlink_ext_ack *extack)
3172 {
3173         struct fib6_info *rt;
3174         int err;
3175
3176         rt = ip6_route_info_create(cfg, gfp_flags, extack);
3177         if (IS_ERR(rt))
3178                 return PTR_ERR(rt);
3179
3180         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3181         fib6_info_release(rt);
3182
3183         return err;
3184 }
3185
3186 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3187 {
3188         struct net *net = info->nl_net;
3189         struct fib6_table *table;
3190         int err;
3191
3192         if (rt == net->ipv6.fib6_null_entry) {
3193                 err = -ENOENT;
3194                 goto out;
3195         }
3196
3197         table = rt->fib6_table;
3198         spin_lock_bh(&table->tb6_lock);
3199         err = fib6_del(rt, info);
3200         spin_unlock_bh(&table->tb6_lock);
3201
3202 out:
3203         fib6_info_release(rt);
3204         return err;
3205 }
3206
3207 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3208 {
3209         struct nl_info info = { .nl_net = net };
3210
3211         return __ip6_del_rt(rt, &info);
3212 }
3213
/* Delete @rt and, when cfg->fc_delete_all_nh is set and @rt has
 * siblings, all of its sibling routes as well.
 *
 * For the multi-nexthop case a single RTM_DELROUTE notification covering
 * all hops is prepared up front; if that works, the per-route
 * notifications are suppressed through info->skip_notify.
 *
 * One reference on @rt is released in every case.  Returns 0 on success,
 * -ENOENT for the null entry, or the error of the failing fib6_del().
 */
static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
{
        struct nl_info *info = &cfg->fc_nlinfo;
        struct net *net = info->nl_net;
        struct sk_buff *skb = NULL;
        struct fib6_table *table;
        int err = -ENOENT;

        if (rt == net->ipv6.fib6_null_entry)
                goto out_put;
        table = rt->fib6_table;
        spin_lock_bh(&table->tb6_lock);

        if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
                struct fib6_info *sibling, *next_sibling;

                /* prefer to send a single notification with all hops */
                skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
                if (skb) {
                        u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

                        if (rt6_fill_node(net, skb, rt, NULL,
                                          NULL, NULL, 0, RTM_DELROUTE,
                                          info->portid, seq, 0) < 0) {
                                /* could not build the combined message;
                                 * skip_notify stays untouched
                                 */
                                kfree_skb(skb);
                                skb = NULL;
                        } else
                                info->skip_notify = 1;
                }

                /* _safe iteration: fib6_del() presumably unlinks the
                 * sibling from this list - confirm
                 */
                list_for_each_entry_safe(sibling, next_sibling,
                                         &rt->fib6_siblings,
                                         fib6_siblings) {
                        err = fib6_del(sibling, info);
                        if (err)
                                goto out_unlock;
                }
        }

        err = fib6_del(rt, info);
out_unlock:
        spin_unlock_bh(&table->tb6_lock);
out_put:
        fib6_info_release(rt);

        /* the combined notification is sent after the lock is dropped,
         * and also when one of the deletions above failed
         */
        if (skb) {
                rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
                            info->nlh, gfp_any());
        }
        return err;
}
3265
3266 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3267 {
3268         int rc = -ESRCH;
3269
3270         if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3271                 goto out;
3272
3273         if (cfg->fc_flags & RTF_GATEWAY &&
3274             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3275                 goto out;
3276
3277         rc = rt6_remove_exception_rt(rt);
3278 out:
3279         return rc;
3280 }
3281
/* Delete the route described by @cfg.
 *
 * With RTF_CACHE set, only cached (exception) routes hanging off the
 * matching node are considered.  Otherwise the first FIB entry matching
 * the optional device, gateway, metric and protocol filters is deleted;
 * its siblings are handled by __ip6_del_rt_siblings() unless a gateway
 * was given.
 *
 * Returns 0 on success, -ESRCH when the table or a matching route does
 * not exist, or the error of the delete helper.
 */
static int ip6_route_del(struct fib6_config *cfg,
                         struct netlink_ext_ack *extack)
{
        struct rt6_info *rt_cache;
        struct fib6_table *table;
        struct fib6_info *rt;
        struct fib6_node *fn;
        int err = -ESRCH;

        table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
        if (!table) {
                NL_SET_ERR_MSG(extack, "FIB table does not exist");
                return err;
        }

        rcu_read_lock();

        /* NOTE(review): the last argument is false for RTF_CACHE requests;
         * presumably it selects exact prefix matching - confirm against
         * fib6_locate()
         */
        fn = fib6_locate(&table->tb6_root,
                         &cfg->fc_dst, cfg->fc_dst_len,
                         &cfg->fc_src, cfg->fc_src_len,
                         !(cfg->fc_flags & RTF_CACHE));

        if (fn) {
                /* NOTE(review): rt is never assigned explicitly in this
                 * function; the iteration macro presumably sets it - confirm
                 */
                for_each_fib6_node_rt_rcu(fn) {
                        struct fib6_nh *nh;

                        if (cfg->fc_flags & RTF_CACHE) {
                                int rc;

                                rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
                                                              &cfg->fc_src);
                                if (rt_cache) {
                                        rc = ip6_del_cached_rt(rt_cache, cfg);
                                        /* -ESRCH means "did not match":
                                         * keep looking at the next entry
                                         */
                                        if (rc != -ESRCH) {
                                                rcu_read_unlock();
                                                return rc;
                                        }
                                }
                                continue;
                        }

                        /* each filter applies only when set in the request */
                        nh = &rt->fib6_nh;
                        if (cfg->fc_ifindex &&
                            (!nh->fib_nh_dev ||
                             nh->fib_nh_dev->ifindex != cfg->fc_ifindex))
                                continue;
                        if (cfg->fc_flags & RTF_GATEWAY &&
                            !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6))
                                continue;
                        if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
                                continue;
                        if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
                                continue;
                        /* take a reference before leaving the RCU section;
                         * entries that cannot be held are skipped
                         */
                        if (!fib6_info_hold_safe(rt))
                                continue;
                        rcu_read_unlock();

                        /* if gateway was specified only delete the one hop */
                        if (cfg->fc_flags & RTF_GATEWAY)
                                return __ip6_del_rt(rt, &cfg->fc_nlinfo);

                        return __ip6_del_rt_siblings(rt, cfg);
                }
        }
        rcu_read_unlock();

        return err;
}
3350
3351 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3352 {
3353         struct netevent_redirect netevent;
3354         struct rt6_info *rt, *nrt = NULL;
3355         struct ndisc_options ndopts;
3356         struct inet6_dev *in6_dev;
3357         struct neighbour *neigh;
3358         struct fib6_info *from;
3359         struct rd_msg *msg;
3360         int optlen, on_link;
3361         u8 *lladdr;
3362
3363         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3364         optlen -= sizeof(*msg);
3365
3366         if (optlen < 0) {
3367                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3368                 return;
3369         }
3370
3371         msg = (struct rd_msg *)icmp6_hdr(skb);
3372
3373         if (ipv6_addr_is_multicast(&msg->dest)) {
3374                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3375                 return;
3376         }
3377
3378         on_link = 0;
3379         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3380                 on_link = 1;
3381         } else if (ipv6_addr_type(&msg->target) !=
3382                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3383                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3384                 return;
3385         }
3386
3387         in6_dev = __in6_dev_get(skb->dev);
3388         if (!in6_dev)
3389                 return;
3390         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3391                 return;
3392
3393         /* RFC2461 8.1:
3394          *      The IP source address of the Redirect MUST be the same as the current
3395          *      first-hop router for the specified ICMP Destination Address.
3396          */
3397
3398         if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3399                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3400                 return;
3401         }
3402
3403         lladdr = NULL;
3404         if (ndopts.nd_opts_tgt_lladdr) {
3405                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3406                                              skb->dev);
3407                 if (!lladdr) {
3408                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3409                         return;
3410                 }
3411         }
3412
3413         rt = (struct rt6_info *) dst;
3414         if (rt->rt6i_flags & RTF_REJECT) {
3415                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3416                 return;
3417         }
3418
3419         /* Redirect received -> path was valid.
3420          * Look, redirects are sent only in response to data packets,
3421          * so that this nexthop apparently is reachable. --ANK
3422          */
3423         dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3424
3425         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3426         if (!neigh)
3427                 return;
3428
3429         /*
3430          *      We have finally decided to accept it.
3431          */
3432
3433         ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3434                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
3435                      NEIGH_UPDATE_F_OVERRIDE|
3436                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3437                                      NEIGH_UPDATE_F_ISROUTER)),
3438                      NDISC_REDIRECT, &ndopts);
3439
3440         rcu_read_lock();
3441         from = rcu_dereference(rt->from);
3442         /* This fib6_info_hold() is safe here because we hold reference to rt
3443          * and rt already holds reference to fib6_info.
3444          */
3445         fib6_info_hold(from);
3446         rcu_read_unlock();
3447
3448         nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3449         if (!nrt)
3450                 goto out;
3451
3452         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3453         if (on_link)
3454                 nrt->rt6i_flags &= ~RTF_GATEWAY;
3455
3456         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3457
3458         /* No need to remove rt from the exception table if rt is
3459          * a cached route because rt6_insert_exception() will
3460          * takes care of it
3461          */
3462         if (rt6_insert_exception(nrt, from)) {
3463                 dst_release_immediate(&nrt->dst);
3464                 goto out;
3465         }
3466
3467         netevent.old = &rt->dst;
3468         netevent.new = &nrt->dst;
3469         netevent.daddr = &msg->dest;
3470         netevent.neigh = neigh;
3471         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3472
3473 out:
3474         fib6_info_release(from);
3475         neigh_release(neigh);
3476 }
3477
3478 #ifdef CONFIG_IPV6_ROUTE_INFO
3479 static struct fib6_info *rt6_get_route_info(struct net *net,
3480                                            const struct in6_addr *prefix, int prefixlen,
3481                                            const struct in6_addr *gwaddr,
3482                                            struct net_device *dev)
3483 {
3484         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3485         int ifindex = dev->ifindex;
3486         struct fib6_node *fn;
3487         struct fib6_info *rt = NULL;
3488         struct fib6_table *table;
3489
3490         table = fib6_get_table(net, tb_id);
3491         if (!table)
3492                 return NULL;
3493
3494         rcu_read_lock();
3495         fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3496         if (!fn)
3497                 goto out;
3498
3499         for_each_fib6_node_rt_rcu(fn) {
3500                 if (rt->fib6_nh.fib_nh_dev->ifindex != ifindex)
3501                         continue;
3502                 if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
3503                     !rt->fib6_nh.fib_nh_gw_family)
3504                         continue;
3505                 if (!ipv6_addr_equal(&rt->fib6_nh.fib_nh_gw6, gwaddr))
3506                         continue;
3507                 if (!fib6_info_hold_safe(rt))
3508                         continue;
3509                 break;
3510         }
3511 out:
3512         rcu_read_unlock();
3513         return rt;
3514 }
3515
3516 static struct fib6_info *rt6_add_route_info(struct net *net,
3517                                            const struct in6_addr *prefix, int prefixlen,
3518                                            const struct in6_addr *gwaddr,
3519                                            struct net_device *dev,
3520                                            unsigned int pref)
3521 {
3522         struct fib6_config cfg = {
3523                 .fc_metric      = IP6_RT_PRIO_USER,
3524                 .fc_ifindex     = dev->ifindex,
3525                 .fc_dst_len     = prefixlen,
3526                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3527                                   RTF_UP | RTF_PREF(pref),
3528                 .fc_protocol = RTPROT_RA,
3529                 .fc_type = RTN_UNICAST,
3530                 .fc_nlinfo.portid = 0,
3531                 .fc_nlinfo.nlh = NULL,
3532                 .fc_nlinfo.nl_net = net,
3533         };
3534
3535         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3536         cfg.fc_dst = *prefix;
3537         cfg.fc_gateway = *gwaddr;
3538
3539         /* We should treat it as a default route if prefix length is 0. */
3540         if (!prefixlen)
3541                 cfg.fc_flags |= RTF_DEFAULT;
3542
3543         ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3544
3545         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3546 }
3547 #endif
3548
3549 struct fib6_info *rt6_get_dflt_router(struct net *net,
3550                                      const struct in6_addr *addr,
3551                                      struct net_device *dev)
3552 {
3553         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3554         struct fib6_info *rt;
3555         struct fib6_table *table;
3556
3557         table = fib6_get_table(net, tb_id);
3558         if (!table)
3559                 return NULL;
3560
3561         rcu_read_lock();
3562         for_each_fib6_node_rt_rcu(&table->tb6_root) {
3563                 struct fib6_nh *nh = &rt->fib6_nh;
3564
3565                 if (dev == nh->fib_nh_dev &&
3566                     ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3567                     ipv6_addr_equal(&nh->fib_nh_gw6, addr))
3568                         break;
3569         }
3570         if (rt && !fib6_info_hold_safe(rt))
3571                 rt = NULL;
3572         rcu_read_unlock();
3573         return rt;
3574 }
3575
3576 struct fib6_info *rt6_add_dflt_router(struct net *net,
3577                                      const struct in6_addr *gwaddr,
3578                                      struct net_device *dev,
3579                                      unsigned int pref)
3580 {
3581         struct fib6_config cfg = {
3582                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3583                 .fc_metric      = IP6_RT_PRIO_USER,
3584                 .fc_ifindex     = dev->ifindex,
3585                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3586                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3587                 .fc_protocol = RTPROT_RA,
3588                 .fc_type = RTN_UNICAST,
3589                 .fc_nlinfo.portid = 0,
3590                 .fc_nlinfo.nlh = NULL,
3591                 .fc_nlinfo.nl_net = net,
3592         };
3593
3594         cfg.fc_gateway = *gwaddr;
3595
3596         if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3597                 struct fib6_table *table;
3598
3599                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3600                 if (table)
3601                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3602         }
3603
3604         return rt6_get_dflt_router(net, gwaddr, dev);
3605 }
3606
3607 static void __rt6_purge_dflt_routers(struct net *net,
3608                                      struct fib6_table *table)
3609 {
3610         struct fib6_info *rt;
3611
3612 restart:
3613         rcu_read_lock();
3614         for_each_fib6_node_rt_rcu(&table->tb6_root) {
3615                 struct net_device *dev = fib6_info_nh_dev(rt);
3616                 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3617
3618                 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3619                     (!idev || idev->cnf.accept_ra != 2) &&
3620                     fib6_info_hold_safe(rt)) {
3621                         rcu_read_unlock();
3622                         ip6_del_rt(net, rt);
3623                         goto restart;
3624                 }
3625         }
3626         rcu_read_unlock();
3627
3628         table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3629 }
3630
3631 void rt6_purge_dflt_routers(struct net *net)
3632 {
3633         struct fib6_table *table;
3634         struct hlist_head *head;
3635         unsigned int h;
3636
3637         rcu_read_lock();
3638
3639         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3640                 head = &net->ipv6.fib_table_hash[h];
3641                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3642                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3643                                 __rt6_purge_dflt_routers(net, table);
3644                 }
3645         }
3646
3647         rcu_read_unlock();
3648 }
3649
3650 static void rtmsg_to_fib6_config(struct net *net,
3651                                  struct in6_rtmsg *rtmsg,
3652                                  struct fib6_config *cfg)
3653 {
3654         *cfg = (struct fib6_config){
3655                 .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3656                          : RT6_TABLE_MAIN,
3657                 .fc_ifindex = rtmsg->rtmsg_ifindex,
3658                 .fc_metric = rtmsg->rtmsg_metric ? : IP6_RT_PRIO_USER,
3659                 .fc_expires = rtmsg->rtmsg_info,
3660                 .fc_dst_len = rtmsg->rtmsg_dst_len,
3661                 .fc_src_len = rtmsg->rtmsg_src_len,
3662                 .fc_flags = rtmsg->rtmsg_flags,
3663                 .fc_type = rtmsg->rtmsg_type,
3664
3665                 .fc_nlinfo.nl_net = net,
3666
3667                 .fc_dst = rtmsg->rtmsg_dst,
3668                 .fc_src = rtmsg->rtmsg_src,
3669                 .fc_gateway = rtmsg->rtmsg_gateway,
3670         };
3671 }
3672
3673 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3674 {
3675         struct fib6_config cfg;
3676         struct in6_rtmsg rtmsg;
3677         int err;
3678
3679         switch (cmd) {
3680         case SIOCADDRT:         /* Add a route */
3681         case SIOCDELRT:         /* Delete a route */
3682                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3683                         return -EPERM;
3684                 err = copy_from_user(&rtmsg, arg,
3685                                      sizeof(struct in6_rtmsg));
3686                 if (err)
3687                         return -EFAULT;
3688
3689                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3690
3691                 rtnl_lock();
3692                 switch (cmd) {
3693                 case SIOCADDRT:
3694                         err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3695                         break;
3696                 case SIOCDELRT:
3697                         err = ip6_route_del(&cfg, NULL);
3698                         break;
3699                 default:
3700                         err = -EINVAL;
3701                 }
3702                 rtnl_unlock();
3703
3704                 return err;
3705         }
3706
3707         return -EINVAL;
3708 }
3709
3710 /*
3711  *      Drop the packet on the floor
3712  */
3713
3714 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3715 {
3716         int type;
3717         struct dst_entry *dst = skb_dst(skb);
3718         switch (ipstats_mib_noroutes) {
3719         case IPSTATS_MIB_INNOROUTES:
3720                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3721                 if (type == IPV6_ADDR_ANY) {
3722                         IP6_INC_STATS(dev_net(dst->dev),
3723                                       __in6_dev_get_safely(skb->dev),
3724                                       IPSTATS_MIB_INADDRERRORS);
3725                         break;
3726                 }
3727                 /* FALLTHROUGH */
3728         case IPSTATS_MIB_OUTNOROUTES:
3729                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3730                               ipstats_mib_noroutes);
3731                 break;
3732         }
3733         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3734         kfree_skb(skb);
3735         return 0;
3736 }
3737
3738 static int ip6_pkt_discard(struct sk_buff *skb)
3739 {
3740         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3741 }
3742
3743 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3744 {
3745         skb->dev = skb_dst(skb)->dev;
3746         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3747 }
3748
3749 static int ip6_pkt_prohibit(struct sk_buff *skb)
3750 {
3751         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3752 }
3753
3754 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3755 {
3756         skb->dev = skb_dst(skb)->dev;
3757         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3758 }
3759
3760 /*
3761  *      Allocate a dst for local (unicast / anycast) address.
3762  */
3763
3764 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3765                                      struct inet6_dev *idev,
3766                                      const struct in6_addr *addr,
3767                                      bool anycast, gfp_t gfp_flags)
3768 {
3769         struct fib6_config cfg = {
3770                 .fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL,
3771                 .fc_ifindex = idev->dev->ifindex,
3772                 .fc_flags = RTF_UP | RTF_ADDRCONF | RTF_NONEXTHOP,
3773                 .fc_dst = *addr,
3774                 .fc_dst_len = 128,
3775                 .fc_protocol = RTPROT_KERNEL,
3776                 .fc_nlinfo.nl_net = net,
3777                 .fc_ignore_dev_down = true,
3778         };
3779
3780         if (anycast) {
3781                 cfg.fc_type = RTN_ANYCAST;
3782                 cfg.fc_flags |= RTF_ANYCAST;
3783         } else {
3784                 cfg.fc_type = RTN_LOCAL;
3785                 cfg.fc_flags |= RTF_LOCAL;
3786         }
3787
3788         return ip6_route_info_create(&cfg, gfp_flags, NULL);
3789 }
3790
/*
 * Argument bundle for fib6_remove_prefsrc(), used to remove a deleted
 * address from the prefsrc entries of routes.
 */
struct arg_dev_net_ip {
        struct net_device *dev;         /* device to match; NULL matches any */
        struct net *net;                /* namespace whose null entry is skipped */
        struct in6_addr *addr;          /* address being removed */
};
3797
3798 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3799 {
3800         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3801         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3802         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3803
3804         if (((void *)rt->fib6_nh.fib_nh_dev == dev || !dev) &&
3805             rt != net->ipv6.fib6_null_entry &&
3806             ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3807                 spin_lock_bh(&rt6_exception_lock);
3808                 /* remove prefsrc entry */
3809                 rt->fib6_prefsrc.plen = 0;
3810                 spin_unlock_bh(&rt6_exception_lock);
3811         }
3812         return 0;
3813 }
3814
3815 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3816 {
3817         struct net *net = dev_net(ifp->idev->dev);
3818         struct arg_dev_net_ip adni = {
3819                 .dev = ifp->idev->dev,
3820                 .net = net,
3821                 .addr = &ifp->addr,
3822         };
3823         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3824 }
3825
3826 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT)
3827
3828 /* Remove routers and update dst entries when gateway turn into host. */
3829 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3830 {
3831         struct in6_addr *gateway = (struct in6_addr *)arg;
3832
3833         if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3834             rt->fib6_nh.fib_nh_gw_family &&
3835             ipv6_addr_equal(gateway, &rt->fib6_nh.fib_nh_gw6)) {
3836                 return -1;
3837         }
3838
3839         /* Further clean up cached routes in exception table.
3840          * This is needed because cached route may have a different
3841          * gateway than its 'parent' in the case of an ip redirect.
3842          */
3843         rt6_exceptions_clean_tohost(rt, gateway);
3844
3845         return 0;
3846 }
3847
3848 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3849 {
3850         fib6_clean_all(net, fib6_clean_tohost, gateway);
3851 }
3852
/*
 * Argument for the fib6_ifup() / fib6_ifdown() callbacks.  Only one
 * member of the union is meaningful for a given walk.
 */
struct arg_netdev_event {
        const struct net_device *dev;   /* device the event is about */
        union {
                unsigned int nh_flags;  /* fib6_ifup: nexthop flags to clear */
                unsigned long event;    /* fib6_ifdown: NETDEV_* event */
        };
};
3860
3861 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3862 {
3863         struct fib6_info *iter;
3864         struct fib6_node *fn;
3865
3866         fn = rcu_dereference_protected(rt->fib6_node,
3867                         lockdep_is_held(&rt->fib6_table->tb6_lock));
3868         iter = rcu_dereference_protected(fn->leaf,
3869                         lockdep_is_held(&rt->fib6_table->tb6_lock));
3870         while (iter) {
3871                 if (iter->fib6_metric == rt->fib6_metric &&
3872                     rt6_qualify_for_ecmp(iter))
3873                         return iter;
3874                 iter = rcu_dereference_protected(iter->fib6_next,
3875                                 lockdep_is_held(&rt->fib6_table->tb6_lock));
3876         }
3877
3878         return NULL;
3879 }
3880
3881 static bool rt6_is_dead(const struct fib6_info *rt)
3882 {
3883         if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD ||
3884             (rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN &&
3885              ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev)))
3886                 return true;
3887
3888         return false;
3889 }
3890
3891 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3892 {
3893         struct fib6_info *iter;
3894         int total = 0;
3895
3896         if (!rt6_is_dead(rt))
3897                 total += rt->fib6_nh.fib_nh_weight;
3898
3899         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3900                 if (!rt6_is_dead(iter))
3901                         total += iter->fib6_nh.fib_nh_weight;
3902         }
3903
3904         return total;
3905 }
3906
3907 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3908 {
3909         int upper_bound = -1;
3910
3911         if (!rt6_is_dead(rt)) {
3912                 *weight += rt->fib6_nh.fib_nh_weight;
3913                 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3914                                                     total) - 1;
3915         }
3916         atomic_set(&rt->fib6_nh.fib_nh_upper_bound, upper_bound);
3917 }
3918
3919 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3920 {
3921         struct fib6_info *iter;
3922         int weight = 0;
3923
3924         rt6_upper_bound_set(rt, &weight, total);
3925
3926         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3927                 rt6_upper_bound_set(iter, &weight, total);
3928 }
3929
3930 void rt6_multipath_rebalance(struct fib6_info *rt)
3931 {
3932         struct fib6_info *first;
3933         int total;
3934
3935         /* In case the entire multipath route was marked for flushing,
3936          * then there is no need to rebalance upon the removal of every
3937          * sibling route.
3938          */
3939         if (!rt->fib6_nsiblings || rt->should_flush)
3940                 return;
3941
3942         /* During lookup routes are evaluated in order, so we need to
3943          * make sure upper bounds are assigned from the first sibling
3944          * onwards.
3945          */
3946         first = rt6_multipath_first_sibling(rt);
3947         if (WARN_ON_ONCE(!first))
3948                 return;
3949
3950         total = rt6_multipath_total_weight(first);
3951         rt6_multipath_upper_bound_set(first, total);
3952 }
3953
3954 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3955 {
3956         const struct arg_netdev_event *arg = p_arg;
3957         struct net *net = dev_net(arg->dev);
3958
3959         if (rt != net->ipv6.fib6_null_entry &&
3960             rt->fib6_nh.fib_nh_dev == arg->dev) {
3961                 rt->fib6_nh.fib_nh_flags &= ~arg->nh_flags;
3962                 fib6_update_sernum_upto_root(net, rt);
3963                 rt6_multipath_rebalance(rt);
3964         }
3965
3966         return 0;
3967 }
3968
3969 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3970 {
3971         struct arg_netdev_event arg = {
3972                 .dev = dev,
3973                 {
3974                         .nh_flags = nh_flags,
3975                 },
3976         };
3977
3978         if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3979                 arg.nh_flags |= RTNH_F_LINKDOWN;
3980
3981         fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3982 }
3983
3984 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3985                                    const struct net_device *dev)
3986 {
3987         struct fib6_info *iter;
3988
3989         if (rt->fib6_nh.fib_nh_dev == dev)
3990                 return true;
3991         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3992                 if (iter->fib6_nh.fib_nh_dev == dev)
3993                         return true;
3994
3995         return false;
3996 }
3997
3998 static void rt6_multipath_flush(struct fib6_info *rt)
3999 {
4000         struct fib6_info *iter;
4001
4002         rt->should_flush = 1;
4003         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4004                 iter->should_flush = 1;
4005 }
4006
4007 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
4008                                              const struct net_device *down_dev)
4009 {
4010         struct fib6_info *iter;
4011         unsigned int dead = 0;
4012
4013         if (rt->fib6_nh.fib_nh_dev == down_dev ||
4014             rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
4015                 dead++;
4016         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4017                 if (iter->fib6_nh.fib_nh_dev == down_dev ||
4018                     iter->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
4019                         dead++;
4020
4021         return dead;
4022 }
4023
4024 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
4025                                        const struct net_device *dev,
4026                                        unsigned int nh_flags)
4027 {
4028         struct fib6_info *iter;
4029
4030         if (rt->fib6_nh.fib_nh_dev == dev)
4031                 rt->fib6_nh.fib_nh_flags |= nh_flags;
4032         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4033                 if (iter->fib6_nh.fib_nh_dev == dev)
4034                         iter->fib6_nh.fib_nh_flags |= nh_flags;
4035 }
4036
4037 /* called with write lock held for table with rt */
4038 static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
4039 {
4040         const struct arg_netdev_event *arg = p_arg;
4041         const struct net_device *dev = arg->dev;
4042         struct net *net = dev_net(dev);
4043
4044         if (rt == net->ipv6.fib6_null_entry)
4045                 return 0;
4046
4047         switch (arg->event) {
4048         case NETDEV_UNREGISTER:
4049                 return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
4050         case NETDEV_DOWN:
4051                 if (rt->should_flush)
4052                         return -1;
4053                 if (!rt->fib6_nsiblings)
4054                         return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
4055                 if (rt6_multipath_uses_dev(rt, dev)) {
4056                         unsigned int count;
4057
4058                         count = rt6_multipath_dead_count(rt, dev);
4059                         if (rt->fib6_nsiblings + 1 == count) {
4060                                 rt6_multipath_flush(rt);
4061                                 return -1;
4062                         }
4063                         rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4064                                                    RTNH_F_LINKDOWN);
4065                         fib6_update_sernum(net, rt);
4066                         rt6_multipath_rebalance(rt);
4067                 }
4068                 return -2;
4069         case NETDEV_CHANGE:
4070                 if (rt->fib6_nh.fib_nh_dev != dev ||
4071                     rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4072                         break;
4073                 rt->fib6_nh.fib_nh_flags |= RTNH_F_LINKDOWN;
4074                 rt6_multipath_rebalance(rt);
4075                 break;
4076         }
4077
4078         return 0;
4079 }
4080
4081 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4082 {
4083         struct arg_netdev_event arg = {
4084                 .dev = dev,
4085                 {
4086                         .event = event,
4087                 },
4088         };
4089         struct net *net = dev_net(dev);
4090
4091         if (net->ipv6.sysctl.skip_notify_on_dev_down)
4092                 fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
4093         else
4094                 fib6_clean_all(net, fib6_ifdown, &arg);
4095 }
4096
4097 void rt6_disable_ip(struct net_device *dev, unsigned long event)
4098 {
4099         rt6_sync_down_dev(dev, event);
4100         rt6_uncached_list_flush_dev(dev_net(dev), dev);
4101         neigh_ifdown(&nd_tbl, dev);
4102 }
4103
/* Argument for rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
        struct net_device *dev;         /* device whose MTU changed */
        unsigned int mtu;               /* the new MTU */
};
4108
4109 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4110 {
4111         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4112         struct inet6_dev *idev;
4113
4114         /* In IPv6 pmtu discovery is not optional,
4115            so that RTAX_MTU lock cannot disable it.
4116            We still use this lock to block changes
4117            caused by addrconf/ndisc.
4118         */
4119
4120         idev = __in6_dev_get(arg->dev);
4121         if (!idev)
4122                 return 0;
4123
4124         /* For administrative MTU increase, there is no way to discover
4125            IPv6 PMTU increase, so PMTU increase should be updated here.
4126            Since RFC 1981 doesn't include administrative MTU increase
4127            update PMTU increase is a MUST. (i.e. jumbo frame)
4128          */
4129         if (rt->fib6_nh.fib_nh_dev == arg->dev &&
4130             !fib6_metric_locked(rt, RTAX_MTU)) {
4131                 u32 mtu = rt->fib6_pmtu;
4132
4133                 if (mtu >= arg->mtu ||
4134                     (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4135                         fib6_metric_set(rt, RTAX_MTU, arg->mtu);
4136
4137                 spin_lock_bh(&rt6_exception_lock);
4138                 rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4139                 spin_unlock_bh(&rt6_exception_lock);
4140         }
4141         return 0;
4142 }
4143
4144 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4145 {
4146         struct rt6_mtu_change_arg arg = {
4147                 .dev = dev,
4148                 .mtu = mtu,
4149         };
4150
4151         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4152 }
4153
4154 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4155         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
4156         [RTA_PREFSRC]           = { .len = sizeof(struct in6_addr) },
4157         [RTA_OIF]               = { .type = NLA_U32 },
4158         [RTA_IIF]               = { .type = NLA_U32 },
4159         [RTA_PRIORITY]          = { .type = NLA_U32 },
4160         [RTA_METRICS]           = { .type = NLA_NESTED },
4161         [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
4162         [RTA_PREF]              = { .type = NLA_U8 },
4163         [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
4164         [RTA_ENCAP]             = { .type = NLA_NESTED },
4165         [RTA_EXPIRES]           = { .type = NLA_U32 },
4166         [RTA_UID]               = { .type = NLA_U32 },
4167         [RTA_MARK]              = { .type = NLA_U32 },
4168         [RTA_TABLE]             = { .type = NLA_U32 },
4169         [RTA_IP_PROTO]          = { .type = NLA_U8 },
4170         [RTA_SPORT]             = { .type = NLA_U16 },
4171         [RTA_DPORT]             = { .type = NLA_U16 },
4172 };
4173
4174 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4175                               struct fib6_config *cfg,
4176                               struct netlink_ext_ack *extack)
4177 {
4178         struct rtmsg *rtm;
4179         struct nlattr *tb[RTA_MAX+1];
4180         unsigned int pref;
4181         int err;
4182
4183         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4184                           extack);
4185         if (err < 0)
4186                 goto errout;
4187
4188         err = -EINVAL;
4189         rtm = nlmsg_data(nlh);
4190
4191         *cfg = (struct fib6_config){
4192                 .fc_table = rtm->rtm_table,
4193                 .fc_dst_len = rtm->rtm_dst_len,
4194                 .fc_src_len = rtm->rtm_src_len,
4195                 .fc_flags = RTF_UP,
4196                 .fc_protocol = rtm->rtm_protocol,
4197                 .fc_type = rtm->rtm_type,
4198
4199                 .fc_nlinfo.portid = NETLINK_CB(skb).portid,
4200                 .fc_nlinfo.nlh = nlh,
4201                 .fc_nlinfo.nl_net = sock_net(skb->sk),
4202         };
4203
4204         if (rtm->rtm_type == RTN_UNREACHABLE ||
4205             rtm->rtm_type == RTN_BLACKHOLE ||
4206             rtm->rtm_type == RTN_PROHIBIT ||
4207             rtm->rtm_type == RTN_THROW)
4208                 cfg->fc_flags |= RTF_REJECT;
4209
4210         if (rtm->rtm_type == RTN_LOCAL)
4211                 cfg->fc_flags |= RTF_LOCAL;
4212
4213         if (rtm->rtm_flags & RTM_F_CLONED)
4214                 cfg->fc_flags |= RTF_CACHE;
4215
4216         cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4217
4218         if (tb[RTA_GATEWAY]) {
4219                 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4220                 cfg->fc_flags |= RTF_GATEWAY;
4221         }
4222         if (tb[RTA_VIA]) {
4223                 NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
4224                 goto errout;
4225         }
4226
4227         if (tb[RTA_DST]) {
4228                 int plen = (rtm->rtm_dst_len + 7) >> 3;
4229
4230                 if (nla_len(tb[RTA_DST]) < plen)
4231                         goto errout;
4232
4233                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4234         }
4235
4236         if (tb[RTA_SRC]) {
4237                 int plen = (rtm->rtm_src_len + 7) >> 3;
4238
4239                 if (nla_len(tb[RTA_SRC]) < plen)
4240                         goto errout;
4241
4242                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4243         }
4244
4245         if (tb[RTA_PREFSRC])
4246                 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4247
4248         if (tb[RTA_OIF])
4249                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4250
4251         if (tb[RTA_PRIORITY])
4252                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4253
4254         if (tb[RTA_METRICS]) {
4255                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4256                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4257         }
4258
4259         if (tb[RTA_TABLE])
4260                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4261
4262         if (tb[RTA_MULTIPATH]) {
4263                 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4264                 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4265
4266                 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4267                                                      cfg->fc_mp_len, extack);
4268                 if (err < 0)
4269                         goto errout;
4270         }
4271
4272         if (tb[RTA_PREF]) {
4273                 pref = nla_get_u8(tb[RTA_PREF]);
4274                 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4275                     pref != ICMPV6_ROUTER_PREF_HIGH)
4276                         pref = ICMPV6_ROUTER_PREF_MEDIUM;
4277                 cfg->fc_flags |= RTF_PREF(pref);
4278         }
4279
4280         if (tb[RTA_ENCAP])
4281                 cfg->fc_encap = tb[RTA_ENCAP];
4282
4283         if (tb[RTA_ENCAP_TYPE]) {
4284                 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4285
4286                 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4287                 if (err < 0)
4288                         goto errout;
4289         }
4290
4291         if (tb[RTA_EXPIRES]) {
4292                 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4293
4294                 if (addrconf_finite_timeout(timeout)) {
4295                         cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4296                         cfg->fc_flags |= RTF_EXPIRES;
4297                 }
4298         }
4299
4300         err = 0;
4301 errout:
4302         return err;
4303 }
4304
4305 struct rt6_nh {
4306         struct fib6_info *fib6_info;
4307         struct fib6_config r_cfg;
4308         struct list_head next;
4309 };
4310
4311 static int ip6_route_info_append(struct net *net,
4312                                  struct list_head *rt6_nh_list,
4313                                  struct fib6_info *rt,
4314                                  struct fib6_config *r_cfg)
4315 {
4316         struct rt6_nh *nh;
4317         int err = -EEXIST;
4318
4319         list_for_each_entry(nh, rt6_nh_list, next) {
4320                 /* check if fib6_info already exists */
4321                 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4322                         return err;
4323         }
4324
4325         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4326         if (!nh)
4327                 return -ENOMEM;
4328         nh->fib6_info = rt;
4329         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4330         list_add_tail(&nh->next, rt6_nh_list);
4331
4332         return 0;
4333 }
4334
4335 static void ip6_route_mpath_notify(struct fib6_info *rt,
4336                                    struct fib6_info *rt_last,
4337                                    struct nl_info *info,
4338                                    __u16 nlflags)
4339 {
4340         /* if this is an APPEND route, then rt points to the first route
4341          * inserted and rt_last points to last route inserted. Userspace
4342          * wants a consistent dump of the route which starts at the first
4343          * nexthop. Since sibling routes are always added at the end of
4344          * the list, find the first sibling of the last route appended
4345          */
4346         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4347                 rt = list_first_entry(&rt_last->fib6_siblings,
4348                                       struct fib6_info,
4349                                       fib6_siblings);
4350         }
4351
4352         if (rt)
4353                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4354 }
4355
4356 static int ip6_route_multipath_add(struct fib6_config *cfg,
4357                                    struct netlink_ext_ack *extack)
4358 {
4359         struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4360         struct nl_info *info = &cfg->fc_nlinfo;
4361         struct fib6_config r_cfg;
4362         struct rtnexthop *rtnh;
4363         struct fib6_info *rt;
4364         struct rt6_nh *err_nh;
4365         struct rt6_nh *nh, *nh_safe;
4366         __u16 nlflags;
4367         int remaining;
4368         int attrlen;
4369         int err = 1;
4370         int nhn = 0;
4371         int replace = (cfg->fc_nlinfo.nlh &&
4372                        (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4373         LIST_HEAD(rt6_nh_list);
4374
4375         nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4376         if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4377                 nlflags |= NLM_F_APPEND;
4378
4379         remaining = cfg->fc_mp_len;
4380         rtnh = (struct rtnexthop *)cfg->fc_mp;
4381
4382         /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4383          * fib6_info structs per nexthop
4384          */
4385         while (rtnh_ok(rtnh, remaining)) {
4386                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4387                 if (rtnh->rtnh_ifindex)
4388                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4389
4390                 attrlen = rtnh_attrlen(rtnh);
4391                 if (attrlen > 0) {
4392                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4393
4394                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4395                         if (nla) {
4396                                 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4397                                 r_cfg.fc_flags |= RTF_GATEWAY;
4398                         }
4399                         r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4400                         nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4401                         if (nla)
4402                                 r_cfg.fc_encap_type = nla_get_u16(nla);
4403                 }
4404
4405                 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4406                 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4407                 if (IS_ERR(rt)) {
4408                         err = PTR_ERR(rt);
4409                         rt = NULL;
4410                         goto cleanup;
4411                 }
4412                 if (!rt6_qualify_for_ecmp(rt)) {
4413                         err = -EINVAL;
4414                         NL_SET_ERR_MSG(extack,
4415                                        "Device only routes can not be added for IPv6 using the multipath API.");
4416                         fib6_info_release(rt);
4417                         goto cleanup;
4418                 }
4419
4420                 rt->fib6_nh.fib_nh_weight = rtnh->rtnh_hops + 1;
4421
4422                 err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4423                                             rt, &r_cfg);
4424                 if (err) {
4425                         fib6_info_release(rt);
4426                         goto cleanup;
4427                 }
4428
4429                 rtnh = rtnh_next(rtnh, &remaining);
4430         }
4431
4432         /* for add and replace send one notification with all nexthops.
4433          * Skip the notification in fib6_add_rt2node and send one with
4434          * the full route when done
4435          */
4436         info->skip_notify = 1;
4437
4438         err_nh = NULL;
4439         list_for_each_entry(nh, &rt6_nh_list, next) {
4440                 err = __ip6_ins_rt(nh->fib6_info, info, extack);
4441                 fib6_info_release(nh->fib6_info);
4442
4443                 if (!err) {
4444                         /* save reference to last route successfully inserted */
4445                         rt_last = nh->fib6_info;
4446
4447                         /* save reference to first route for notification */
4448                         if (!rt_notif)
4449                                 rt_notif = nh->fib6_info;
4450                 }
4451
4452                 /* nh->fib6_info is used or freed at this point, reset to NULL*/
4453                 nh->fib6_info = NULL;
4454                 if (err) {
4455                         if (replace && nhn)
4456                                 NL_SET_ERR_MSG_MOD(extack,
4457                                                    "multipath route replace failed (check consistency of installed routes)");
4458                         err_nh = nh;
4459                         goto add_errout;
4460                 }
4461
4462                 /* Because each route is added like a single route we remove
4463                  * these flags after the first nexthop: if there is a collision,
4464                  * we have already failed to add the first nexthop:
4465                  * fib6_add_rt2node() has rejected it; when replacing, old
4466                  * nexthops have been replaced by first new, the rest should
4467                  * be added to it.
4468                  */
4469                 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4470                                                      NLM_F_REPLACE);
4471                 nhn++;
4472         }
4473
4474         /* success ... tell user about new route */
4475         ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4476         goto cleanup;
4477
4478 add_errout:
4479         /* send notification for routes that were added so that
4480          * the delete notifications sent by ip6_route_del are
4481          * coherent
4482          */
4483         if (rt_notif)
4484                 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4485
4486         /* Delete routes that were already added */
4487         list_for_each_entry(nh, &rt6_nh_list, next) {
4488                 if (err_nh == nh)
4489                         break;
4490                 ip6_route_del(&nh->r_cfg, extack);
4491         }
4492
4493 cleanup:
4494         list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4495                 if (nh->fib6_info)
4496                         fib6_info_release(nh->fib6_info);
4497                 list_del(&nh->next);
4498                 kfree(nh);
4499         }
4500
4501         return err;
4502 }
4503
4504 static int ip6_route_multipath_del(struct fib6_config *cfg,
4505                                    struct netlink_ext_ack *extack)
4506 {
4507         struct fib6_config r_cfg;
4508         struct rtnexthop *rtnh;
4509         int remaining;
4510         int attrlen;
4511         int err = 1, last_err = 0;
4512
4513         remaining = cfg->fc_mp_len;
4514         rtnh = (struct rtnexthop *)cfg->fc_mp;
4515
4516         /* Parse a Multipath Entry */
4517         while (rtnh_ok(rtnh, remaining)) {
4518                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4519                 if (rtnh->rtnh_ifindex)
4520                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4521
4522                 attrlen = rtnh_attrlen(rtnh);
4523                 if (attrlen > 0) {
4524                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4525
4526                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4527                         if (nla) {
4528                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4529                                 r_cfg.fc_flags |= RTF_GATEWAY;
4530                         }
4531                 }
4532                 err = ip6_route_del(&r_cfg, extack);
4533                 if (err)
4534                         last_err = err;
4535
4536                 rtnh = rtnh_next(rtnh, &remaining);
4537         }
4538
4539         return last_err;
4540 }
4541
4542 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4543                               struct netlink_ext_ack *extack)
4544 {
4545         struct fib6_config cfg;
4546         int err;
4547
4548         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4549         if (err < 0)
4550                 return err;
4551
4552         if (cfg.fc_mp)
4553                 return ip6_route_multipath_del(&cfg, extack);
4554         else {
4555                 cfg.fc_delete_all_nh = 1;
4556                 return ip6_route_del(&cfg, extack);
4557         }
4558 }
4559
4560 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4561                               struct netlink_ext_ack *extack)
4562 {
4563         struct fib6_config cfg;
4564         int err;
4565
4566         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4567         if (err < 0)
4568                 return err;
4569
4570         if (cfg.fc_metric == 0)
4571                 cfg.fc_metric = IP6_RT_PRIO_USER;
4572
4573         if (cfg.fc_mp)
4574                 return ip6_route_multipath_add(&cfg, extack);
4575         else
4576                 return ip6_route_add(&cfg, GFP_KERNEL, extack);
4577 }
4578
4579 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4580 {
4581         int nexthop_len = 0;
4582
4583         if (rt->fib6_nsiblings) {
4584                 nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
4585                             + NLA_ALIGN(sizeof(struct rtnexthop))
4586                             + nla_total_size(16) /* RTA_GATEWAY */
4587                             + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws);
4588
4589                 nexthop_len *= rt->fib6_nsiblings;
4590         }
4591
4592         return NLMSG_ALIGN(sizeof(struct rtmsg))
4593                + nla_total_size(16) /* RTA_SRC */
4594                + nla_total_size(16) /* RTA_DST */
4595                + nla_total_size(16) /* RTA_GATEWAY */
4596                + nla_total_size(16) /* RTA_PREFSRC */
4597                + nla_total_size(4) /* RTA_TABLE */
4598                + nla_total_size(4) /* RTA_IIF */
4599                + nla_total_size(4) /* RTA_OIF */
4600                + nla_total_size(4) /* RTA_PRIORITY */
4601                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4602                + nla_total_size(sizeof(struct rta_cacheinfo))
4603                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4604                + nla_total_size(1) /* RTA_PREF */
4605                + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws)
4606                + nexthop_len;
4607 }
4608
4609 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4610                          struct fib6_info *rt, struct dst_entry *dst,
4611                          struct in6_addr *dest, struct in6_addr *src,
4612                          int iif, int type, u32 portid, u32 seq,
4613                          unsigned int flags)
4614 {
4615         struct rt6_info *rt6 = (struct rt6_info *)dst;
4616         struct rt6key *rt6_dst, *rt6_src;
4617         u32 *pmetrics, table, rt6_flags;
4618         struct nlmsghdr *nlh;
4619         struct rtmsg *rtm;
4620         long expires = 0;
4621
4622         nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4623         if (!nlh)
4624                 return -EMSGSIZE;
4625
4626         if (rt6) {
4627                 rt6_dst = &rt6->rt6i_dst;
4628                 rt6_src = &rt6->rt6i_src;
4629                 rt6_flags = rt6->rt6i_flags;
4630         } else {
4631                 rt6_dst = &rt->fib6_dst;
4632                 rt6_src = &rt->fib6_src;
4633                 rt6_flags = rt->fib6_flags;
4634         }
4635
4636         rtm = nlmsg_data(nlh);
4637         rtm->rtm_family = AF_INET6;
4638         rtm->rtm_dst_len = rt6_dst->plen;
4639         rtm->rtm_src_len = rt6_src->plen;
4640         rtm->rtm_tos = 0;
4641         if (rt->fib6_table)
4642                 table = rt->fib6_table->tb6_id;
4643         else
4644                 table = RT6_TABLE_UNSPEC;
4645         rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
4646         if (nla_put_u32(skb, RTA_TABLE, table))
4647                 goto nla_put_failure;
4648
4649         rtm->rtm_type = rt->fib6_type;
4650         rtm->rtm_flags = 0;
4651         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4652         rtm->rtm_protocol = rt->fib6_protocol;
4653
4654         if (rt6_flags & RTF_CACHE)
4655                 rtm->rtm_flags |= RTM_F_CLONED;
4656
4657         if (dest) {
4658                 if (nla_put_in6_addr(skb, RTA_DST, dest))
4659                         goto nla_put_failure;
4660                 rtm->rtm_dst_len = 128;
4661         } else if (rtm->rtm_dst_len)
4662                 if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
4663                         goto nla_put_failure;
4664 #ifdef CONFIG_IPV6_SUBTREES
4665         if (src) {
4666                 if (nla_put_in6_addr(skb, RTA_SRC, src))
4667                         goto nla_put_failure;
4668                 rtm->rtm_src_len = 128;
4669         } else if (rtm->rtm_src_len &&
4670                    nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
4671                 goto nla_put_failure;
4672 #endif
4673         if (iif) {
4674 #ifdef CONFIG_IPV6_MROUTE
4675                 if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
4676                         int err = ip6mr_get_route(net, skb, rtm, portid);
4677
4678                         if (err == 0)
4679                                 return 0;
4680                         if (err < 0)
4681                                 goto nla_put_failure;
4682                 } else
4683 #endif
4684                         if (nla_put_u32(skb, RTA_IIF, iif))
4685                                 goto nla_put_failure;
4686         } else if (dest) {
4687                 struct in6_addr saddr_buf;
4688                 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4689                     nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4690                         goto nla_put_failure;
4691         }
4692
4693         if (rt->fib6_prefsrc.plen) {
4694                 struct in6_addr saddr_buf;
4695                 saddr_buf = rt->fib6_prefsrc.addr;
4696                 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4697                         goto nla_put_failure;
4698         }
4699
4700         pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
4701         if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4702                 goto nla_put_failure;
4703
4704         if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
4705                 goto nla_put_failure;
4706
4707         /* For multipath routes, walk the siblings list and add
4708          * each as a nexthop within RTA_MULTIPATH.
4709          */
4710         if (rt6) {
4711                 if (rt6_flags & RTF_GATEWAY &&
4712                     nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
4713                         goto nla_put_failure;
4714
4715                 if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
4716                         goto nla_put_failure;
4717         } else if (rt->fib6_nsiblings) {
4718                 struct fib6_info *sibling, *next_sibling;
4719                 struct nlattr *mp;
4720
4721                 mp = nla_nest_start(skb, RTA_MULTIPATH);
4722                 if (!mp)
4723                         goto nla_put_failure;
4724
4725                 if (fib_add_nexthop(skb, &rt->fib6_nh.nh_common,
4726                                     rt->fib6_nh.fib_nh_weight) < 0)
4727                         goto nla_put_failure;
4728
4729                 list_for_each_entry_safe(sibling, next_sibling,
4730                                          &rt->fib6_siblings, fib6_siblings) {
4731                         if (fib_add_nexthop(skb, &sibling->fib6_nh.nh_common,
4732                                             sibling->fib6_nh.fib_nh_weight) < 0)
4733                                 goto nla_put_failure;
4734                 }
4735
4736                 nla_nest_end(skb, mp);
4737         } else {
4738                 if (fib_nexthop_info(skb, &rt->fib6_nh.nh_common,
4739                                      &rtm->rtm_flags, false) < 0)
4740                         goto nla_put_failure;
4741         }
4742
4743         if (rt6_flags & RTF_EXPIRES) {
4744                 expires = dst ? dst->expires : rt->expires;
4745                 expires -= jiffies;
4746         }
4747
4748         if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4749                 goto nla_put_failure;
4750
4751         if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
4752                 goto nla_put_failure;
4753
4754
4755         nlmsg_end(skb, nlh);
4756         return 0;
4757
4758 nla_put_failure:
4759         nlmsg_cancel(skb, nlh);
4760         return -EMSGSIZE;
4761 }
4762
4763 static bool fib6_info_uses_dev(const struct fib6_info *f6i,
4764                                const struct net_device *dev)
4765 {
4766         if (f6i->fib6_nh.fib_nh_dev == dev)
4767                 return true;
4768
4769         if (f6i->fib6_nsiblings) {
4770                 struct fib6_info *sibling, *next_sibling;
4771
4772                 list_for_each_entry_safe(sibling, next_sibling,
4773                                          &f6i->fib6_siblings, fib6_siblings) {
4774                         if (sibling->fib6_nh.fib_nh_dev == dev)
4775                                 return true;
4776                 }
4777         }
4778
4779         return false;
4780 }
4781
4782 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4783 {
4784         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4785         struct fib_dump_filter *filter = &arg->filter;
4786         unsigned int flags = NLM_F_MULTI;
4787         struct net *net = arg->net;
4788
4789         if (rt == net->ipv6.fib6_null_entry)
4790                 return 0;
4791
4792         if ((filter->flags & RTM_F_PREFIX) &&
4793             !(rt->fib6_flags & RTF_PREFIX_RT)) {
4794                 /* success since this is not a prefix route */
4795                 return 1;
4796         }
4797         if (filter->filter_set) {
4798                 if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
4799                     (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
4800                     (filter->protocol && rt->fib6_protocol != filter->protocol)) {
4801                         return 1;
4802                 }
4803                 flags |= NLM_F_DUMP_FILTERED;
4804         }
4805
4806         return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4807                              RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4808                              arg->cb->nlh->nlmsg_seq, flags);
4809 }
4810
4811 static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
4812                                         const struct nlmsghdr *nlh,
4813                                         struct nlattr **tb,
4814                                         struct netlink_ext_ack *extack)
4815 {
4816         struct rtmsg *rtm;
4817         int i, err;
4818
4819         if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
4820                 NL_SET_ERR_MSG_MOD(extack,
4821                                    "Invalid header for get route request");
4822                 return -EINVAL;
4823         }
4824
4825         if (!netlink_strict_get_check(skb))
4826                 return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX,
4827                                    rtm_ipv6_policy, extack);
4828
4829         rtm = nlmsg_data(nlh);
4830         if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
4831             (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
4832             rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
4833             rtm->rtm_type) {
4834                 NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
4835                 return -EINVAL;
4836         }
4837         if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
4838                 NL_SET_ERR_MSG_MOD(extack,
4839                                    "Invalid flags for get route request");
4840                 return -EINVAL;
4841         }
4842
4843         err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
4844                                  rtm_ipv6_policy, extack);
4845         if (err)
4846                 return err;
4847
4848         if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
4849             (tb[RTA_DST] && !rtm->rtm_dst_len)) {
4850                 NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
4851                 return -EINVAL;
4852         }
4853
4854         for (i = 0; i <= RTA_MAX; i++) {
4855                 if (!tb[i])
4856                         continue;
4857
4858                 switch (i) {
4859                 case RTA_SRC:
4860                 case RTA_DST:
4861                 case RTA_IIF:
4862                 case RTA_OIF:
4863                 case RTA_MARK:
4864                 case RTA_UID:
4865                 case RTA_SPORT:
4866                 case RTA_DPORT:
4867                 case RTA_IP_PROTO:
4868                         break;
4869                 default:
4870                         NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
4871                         return -EINVAL;
4872                 }
4873         }
4874
4875         return 0;
4876 }
4877
4878 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4879                               struct netlink_ext_ack *extack)
4880 {
4881         struct net *net = sock_net(in_skb->sk);
4882         struct nlattr *tb[RTA_MAX+1];
4883         int err, iif = 0, oif = 0;
4884         struct fib6_info *from;
4885         struct dst_entry *dst;
4886         struct rt6_info *rt;
4887         struct sk_buff *skb;
4888         struct rtmsg *rtm;
4889         struct flowi6 fl6 = {};
4890         bool fibmatch;
4891
4892         err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
4893         if (err < 0)
4894                 goto errout;
4895
4896         err = -EINVAL;
4897         rtm = nlmsg_data(nlh);
4898         fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4899         fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4900
4901         if (tb[RTA_SRC]) {
4902                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4903                         goto errout;
4904
4905                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4906         }
4907
4908         if (tb[RTA_DST]) {
4909                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4910                         goto errout;
4911
4912                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4913         }
4914
4915         if (tb[RTA_IIF])
4916                 iif = nla_get_u32(tb[RTA_IIF]);
4917
4918         if (tb[RTA_OIF])
4919                 oif = nla_get_u32(tb[RTA_OIF]);
4920
4921         if (tb[RTA_MARK])
4922                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4923
4924         if (tb[RTA_UID])
4925                 fl6.flowi6_uid = make_kuid(current_user_ns(),
4926                                            nla_get_u32(tb[RTA_UID]));
4927         else
4928                 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4929
4930         if (tb[RTA_SPORT])
4931                 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
4932
4933         if (tb[RTA_DPORT])
4934                 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
4935
4936         if (tb[RTA_IP_PROTO]) {
4937                 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
4938                                                   &fl6.flowi6_proto, AF_INET6,
4939                                                   extack);
4940                 if (err)
4941                         goto errout;
4942         }
4943
4944         if (iif) {
4945                 struct net_device *dev;
4946                 int flags = 0;
4947
4948                 rcu_read_lock();
4949
4950                 dev = dev_get_by_index_rcu(net, iif);
4951                 if (!dev) {
4952                         rcu_read_unlock();
4953                         err = -ENODEV;
4954                         goto errout;
4955                 }
4956
4957                 fl6.flowi6_iif = iif;
4958
4959                 if (!ipv6_addr_any(&fl6.saddr))
4960                         flags |= RT6_LOOKUP_F_HAS_SADDR;
4961
4962                 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
4963
4964                 rcu_read_unlock();
4965         } else {
4966                 fl6.flowi6_oif = oif;
4967
4968                 dst = ip6_route_output(net, NULL, &fl6);
4969         }
4970
4971
4972         rt = container_of(dst, struct rt6_info, dst);
4973         if (rt->dst.error) {
4974                 err = rt->dst.error;
4975                 ip6_rt_put(rt);
4976                 goto errout;
4977         }
4978
4979         if (rt == net->ipv6.ip6_null_entry) {
4980                 err = rt->dst.error;
4981                 ip6_rt_put(rt);
4982                 goto errout;
4983         }
4984
4985         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4986         if (!skb) {
4987                 ip6_rt_put(rt);
4988                 err = -ENOBUFS;
4989                 goto errout;
4990         }
4991
4992         skb_dst_set(skb, &rt->dst);
4993
4994         rcu_read_lock();
4995         from = rcu_dereference(rt->from);
4996
4997         if (fibmatch)
4998                 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
4999                                     RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
5000                                     nlh->nlmsg_seq, 0);
5001         else
5002                 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
5003                                     &fl6.saddr, iif, RTM_NEWROUTE,
5004                                     NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
5005                                     0);
5006         rcu_read_unlock();
5007
5008         if (err < 0) {
5009                 kfree_skb(skb);
5010                 goto errout;
5011         }
5012
5013         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
5014 errout:
5015         return err;
5016 }
5017
5018 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
5019                      unsigned int nlm_flags)
5020 {
5021         struct sk_buff *skb;
5022         struct net *net = info->nl_net;
5023         u32 seq;
5024         int err;
5025
5026         err = -ENOBUFS;
5027         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
5028
5029         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
5030         if (!skb)
5031                 goto errout;
5032
5033         err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
5034                             event, info->portid, seq, nlm_flags);
5035         if (err < 0) {
5036                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
5037                 WARN_ON(err == -EMSGSIZE);
5038                 kfree_skb(skb);
5039                 goto errout;
5040         }
5041         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
5042                     info->nlh, gfp_any());
5043         return;
5044 errout:
5045         if (err < 0)
5046                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
5047 }
5048
5049 static int ip6_route_dev_notify(struct notifier_block *this,
5050                                 unsigned long event, void *ptr)
5051 {
5052         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
5053         struct net *net = dev_net(dev);
5054
5055         if (!(dev->flags & IFF_LOOPBACK))
5056                 return NOTIFY_OK;
5057
5058         if (event == NETDEV_REGISTER) {
5059                 net->ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = dev;
5060                 net->ipv6.ip6_null_entry->dst.dev = dev;
5061                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
5062 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5063                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
5064                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
5065                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
5066                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
5067 #endif
5068          } else if (event == NETDEV_UNREGISTER &&
5069                     dev->reg_state != NETREG_UNREGISTERED) {
5070                 /* NETDEV_UNREGISTER could be fired for multiple times by
5071                  * netdev_wait_allrefs(). Make sure we only call this once.
5072                  */
5073                 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
5074 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5075                 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
5076                 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
5077 #endif
5078         }
5079
5080         return NOTIFY_OK;
5081 }
5082
5083 /*
5084  *      /proc
5085  */
5086
5087 #ifdef CONFIG_PROC_FS
5088 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
5089 {
5090         struct net *net = (struct net *)seq->private;
5091         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
5092                    net->ipv6.rt6_stats->fib_nodes,
5093                    net->ipv6.rt6_stats->fib_route_nodes,
5094                    atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
5095                    net->ipv6.rt6_stats->fib_rt_entries,
5096                    net->ipv6.rt6_stats->fib_rt_cache,
5097                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
5098                    net->ipv6.rt6_stats->fib_discarded_routes);
5099
5100         return 0;
5101 }
5102 #endif  /* CONFIG_PROC_FS */
5103
5104 #ifdef CONFIG_SYSCTL
5105
5106 static
5107 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
5108                               void __user *buffer, size_t *lenp, loff_t *ppos)
5109 {
5110         struct net *net;
5111         int delay;
5112         int ret;
5113         if (!write)
5114                 return -EINVAL;
5115
5116         net = (struct net *)ctl->extra1;
5117         delay = net->ipv6.sysctl.flush_delay;
5118         ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
5119         if (ret)
5120                 return ret;
5121
5122         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
5123         return 0;
5124 }
5125
5126 static int zero;
5127 static int one = 1;
5128
5129 static struct ctl_table ipv6_route_table_template[] = {
5130         {
5131                 .procname       =       "flush",
5132                 .data           =       &init_net.ipv6.sysctl.flush_delay,
5133                 .maxlen         =       sizeof(int),
5134                 .mode           =       0200,
5135                 .proc_handler   =       ipv6_sysctl_rtcache_flush
5136         },
5137         {
5138                 .procname       =       "gc_thresh",
5139                 .data           =       &ip6_dst_ops_template.gc_thresh,
5140                 .maxlen         =       sizeof(int),
5141                 .mode           =       0644,
5142                 .proc_handler   =       proc_dointvec,
5143         },
5144         {
5145                 .procname       =       "max_size",
5146                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
5147                 .maxlen         =       sizeof(int),
5148                 .mode           =       0644,
5149                 .proc_handler   =       proc_dointvec,
5150         },
5151         {
5152                 .procname       =       "gc_min_interval",
5153                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5154                 .maxlen         =       sizeof(int),
5155                 .mode           =       0644,
5156                 .proc_handler   =       proc_dointvec_jiffies,
5157         },
5158         {
5159                 .procname       =       "gc_timeout",
5160                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
5161                 .maxlen         =       sizeof(int),
5162                 .mode           =       0644,
5163                 .proc_handler   =       proc_dointvec_jiffies,
5164         },
5165         {
5166                 .procname       =       "gc_interval",
5167                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
5168                 .maxlen         =       sizeof(int),
5169                 .mode           =       0644,
5170                 .proc_handler   =       proc_dointvec_jiffies,
5171         },
5172         {
5173                 .procname       =       "gc_elasticity",
5174                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
5175                 .maxlen         =       sizeof(int),
5176                 .mode           =       0644,
5177                 .proc_handler   =       proc_dointvec,
5178         },
5179         {
5180                 .procname       =       "mtu_expires",
5181                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
5182                 .maxlen         =       sizeof(int),
5183                 .mode           =       0644,
5184                 .proc_handler   =       proc_dointvec_jiffies,
5185         },
5186         {
5187                 .procname       =       "min_adv_mss",
5188                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
5189                 .maxlen         =       sizeof(int),
5190                 .mode           =       0644,
5191                 .proc_handler   =       proc_dointvec,
5192         },
5193         {
5194                 .procname       =       "gc_min_interval_ms",
5195                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5196                 .maxlen         =       sizeof(int),
5197                 .mode           =       0644,
5198                 .proc_handler   =       proc_dointvec_ms_jiffies,
5199         },
5200         {
5201                 .procname       =       "skip_notify_on_dev_down",
5202                 .data           =       &init_net.ipv6.sysctl.skip_notify_on_dev_down,
5203                 .maxlen         =       sizeof(int),
5204                 .mode           =       0644,
5205                 .proc_handler   =       proc_dointvec,
5206                 .extra1         =       &zero,
5207                 .extra2         =       &one,
5208         },
5209         { }
5210 };
5211
5212 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
5213 {
5214         struct ctl_table *table;
5215
5216         table = kmemdup(ipv6_route_table_template,
5217                         sizeof(ipv6_route_table_template),
5218                         GFP_KERNEL);
5219
5220         if (table) {
5221                 table[0].data = &net->ipv6.sysctl.flush_delay;
5222                 table[0].extra1 = net;
5223                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
5224                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
5225                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5226                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
5227                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
5228                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
5229                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5230                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5231                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5232                 table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;
5233
5234                 /* Don't export sysctls to unprivileged users */
5235                 if (net->user_ns != &init_user_ns)
5236                         table[0].procname = NULL;
5237         }
5238
5239         return table;
5240 }
5241 #endif
5242
5243 static int __net_init ip6_route_net_init(struct net *net)
5244 {
5245         int ret = -ENOMEM;
5246
5247         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
5248                sizeof(net->ipv6.ip6_dst_ops));
5249
5250         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
5251                 goto out_ip6_dst_ops;
5252
5253         net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
5254                                             sizeof(*net->ipv6.fib6_null_entry),
5255                                             GFP_KERNEL);
5256         if (!net->ipv6.fib6_null_entry)
5257                 goto out_ip6_dst_entries;
5258
5259         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
5260                                            sizeof(*net->ipv6.ip6_null_entry),
5261                                            GFP_KERNEL);
5262         if (!net->ipv6.ip6_null_entry)
5263                 goto out_fib6_null_entry;
5264         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5265         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
5266                          ip6_template_metrics, true);
5267
5268 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5269         net->ipv6.fib6_has_custom_rules = false;
5270         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
5271                                                sizeof(*net->ipv6.ip6_prohibit_entry),
5272                                                GFP_KERNEL);
5273         if (!net->ipv6.ip6_prohibit_entry)
5274                 goto out_ip6_null_entry;
5275         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5276         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
5277                          ip6_template_metrics, true);
5278
5279         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
5280                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
5281                                                GFP_KERNEL);
5282         if (!net->ipv6.ip6_blk_hole_entry)
5283                 goto out_ip6_prohibit_entry;
5284         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5285         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
5286                          ip6_template_metrics, true);
5287 #endif
5288
5289         net->ipv6.sysctl.flush_delay = 0;
5290         net->ipv6.sysctl.ip6_rt_max_size = 4096;
5291         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
5292         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
5293         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
5294         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
5295         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
5296         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
5297         net->ipv6.sysctl.skip_notify_on_dev_down = 0;
5298
5299         net->ipv6.ip6_rt_gc_expire = 30*HZ;
5300
5301         ret = 0;
5302 out:
5303         return ret;
5304
5305 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5306 out_ip6_prohibit_entry:
5307         kfree(net->ipv6.ip6_prohibit_entry);
5308 out_ip6_null_entry:
5309         kfree(net->ipv6.ip6_null_entry);
5310 #endif
5311 out_fib6_null_entry:
5312         kfree(net->ipv6.fib6_null_entry);
5313 out_ip6_dst_entries:
5314         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5315 out_ip6_dst_ops:
5316         goto out;
5317 }
5318
5319 static void __net_exit ip6_route_net_exit(struct net *net)
5320 {
5321         kfree(net->ipv6.fib6_null_entry);
5322         kfree(net->ipv6.ip6_null_entry);
5323 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5324         kfree(net->ipv6.ip6_prohibit_entry);
5325         kfree(net->ipv6.ip6_blk_hole_entry);
5326 #endif
5327         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5328 }
5329
5330 static int __net_init ip6_route_net_init_late(struct net *net)
5331 {
5332 #ifdef CONFIG_PROC_FS
5333         proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
5334                         sizeof(struct ipv6_route_iter));
5335         proc_create_net_single("rt6_stats", 0444, net->proc_net,
5336                         rt6_stats_seq_show, NULL);
5337 #endif
5338         return 0;
5339 }
5340
5341 static void __net_exit ip6_route_net_exit_late(struct net *net)
5342 {
5343 #ifdef CONFIG_PROC_FS
5344         remove_proc_entry("ipv6_route", net->proc_net);
5345         remove_proc_entry("rt6_stats", net->proc_net);
5346 #endif
5347 }
5348
5349 static struct pernet_operations ip6_route_net_ops = {
5350         .init = ip6_route_net_init,
5351         .exit = ip6_route_net_exit,
5352 };
5353
5354 static int __net_init ipv6_inetpeer_init(struct net *net)
5355 {
5356         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5357
5358         if (!bp)
5359                 return -ENOMEM;
5360         inet_peer_base_init(bp);
5361         net->ipv6.peers = bp;
5362         return 0;
5363 }
5364
5365 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5366 {
5367         struct inet_peer_base *bp = net->ipv6.peers;
5368
5369         net->ipv6.peers = NULL;
5370         inetpeer_invalidate_tree(bp);
5371         kfree(bp);
5372 }
5373
5374 static struct pernet_operations ipv6_inetpeer_ops = {
5375         .init   =       ipv6_inetpeer_init,
5376         .exit   =       ipv6_inetpeer_exit,
5377 };
5378
5379 static struct pernet_operations ip6_route_net_late_ops = {
5380         .init = ip6_route_net_init_late,
5381         .exit = ip6_route_net_exit_late,
5382 };
5383
5384 static struct notifier_block ip6_route_dev_notifier = {
5385         .notifier_call = ip6_route_dev_notify,
5386         .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
5387 };
5388
5389 void __init ip6_route_init_special_entries(void)
5390 {
5391         /* Registering of the loopback is done before this portion of code,
5392          * the loopback reference in rt6_info will not be taken, do it
5393          * manually for init_net */
5394         init_net.ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = init_net.loopback_dev;
5395         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5396         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5397   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5398         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5399         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5400         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5401         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5402   #endif
5403 }
5404
5405 int __init ip6_route_init(void)
5406 {
5407         int ret;
5408         int cpu;
5409
5410         ret = -ENOMEM;
5411         ip6_dst_ops_template.kmem_cachep =
5412                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
5413                                   SLAB_HWCACHE_ALIGN, NULL);
5414         if (!ip6_dst_ops_template.kmem_cachep)
5415                 goto out;
5416
5417         ret = dst_entries_init(&ip6_dst_blackhole_ops);
5418         if (ret)
5419                 goto out_kmem_cache;
5420
5421         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5422         if (ret)
5423                 goto out_dst_entries;
5424
5425         ret = register_pernet_subsys(&ip6_route_net_ops);
5426         if (ret)
5427                 goto out_register_inetpeer;
5428
5429         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5430
5431         ret = fib6_init();
5432         if (ret)
5433                 goto out_register_subsys;
5434
5435         ret = xfrm6_init();
5436         if (ret)
5437                 goto out_fib6_init;
5438
5439         ret = fib6_rules_init();
5440         if (ret)
5441                 goto xfrm6_init;
5442
5443         ret = register_pernet_subsys(&ip6_route_net_late_ops);
5444         if (ret)
5445                 goto fib6_rules_init;
5446
5447         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5448                                    inet6_rtm_newroute, NULL, 0);
5449         if (ret < 0)
5450                 goto out_register_late_subsys;
5451
5452         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5453                                    inet6_rtm_delroute, NULL, 0);
5454         if (ret < 0)
5455                 goto out_register_late_subsys;
5456
5457         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5458                                    inet6_rtm_getroute, NULL,
5459                                    RTNL_FLAG_DOIT_UNLOCKED);
5460         if (ret < 0)
5461                 goto out_register_late_subsys;
5462
5463         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5464         if (ret)
5465                 goto out_register_late_subsys;
5466
5467         for_each_possible_cpu(cpu) {
5468                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5469
5470                 INIT_LIST_HEAD(&ul->head);
5471                 spin_lock_init(&ul->lock);
5472         }
5473
5474 out:
5475         return ret;
5476
5477 out_register_late_subsys:
5478         rtnl_unregister_all(PF_INET6);
5479         unregister_pernet_subsys(&ip6_route_net_late_ops);
5480 fib6_rules_init:
5481         fib6_rules_cleanup();
5482 xfrm6_init:
5483         xfrm6_fini();
5484 out_fib6_init:
5485         fib6_gc_cleanup();
5486 out_register_subsys:
5487         unregister_pernet_subsys(&ip6_route_net_ops);
5488 out_register_inetpeer:
5489         unregister_pernet_subsys(&ipv6_inetpeer_ops);
5490 out_dst_entries:
5491         dst_entries_destroy(&ip6_dst_blackhole_ops);
5492 out_kmem_cache:
5493         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5494         goto out;
5495 }
5496
5497 void ip6_route_cleanup(void)
5498 {
5499         unregister_netdevice_notifier(&ip6_route_dev_notifier);
5500         unregister_pernet_subsys(&ip6_route_net_late_ops);
5501         fib6_rules_cleanup();
5502         xfrm6_fini();
5503         fib6_gc_cleanup();
5504         unregister_pernet_subsys(&ipv6_inetpeer_ops);
5505         unregister_pernet_subsys(&ip6_route_net_ops);
5506         dst_entries_destroy(&ip6_dst_blackhole_ops);
5507         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5508 }