]> asedeno.scripts.mit.edu Git - linux.git/blob - net/ipv6/route.c
ASoC: don't use rtd->codec on fsl-asoc-card
[linux.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <trace/events/fib6.h>
67
68 #include <linux/uaccess.h>
69
70 #ifdef CONFIG_SYSCTL
71 #include <linux/sysctl.h>
72 #endif
73
/*
 * Result of the next-hop neighbour check used while scoring a route.
 * All failure codes are negative so callers can simply test "< 0".
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD	= -3,	/* find_match() skips the route */
	RT6_NUD_FAIL_PROBE	= -2,	/* neighbour entry is NUD_FAILED */
	RT6_NUD_FAIL_DO_RR	= -1,	/* find_match() asks for round-robin */
	RT6_NUD_SUCCEED		= 1
};
80
81 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
82 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
83 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
84 static unsigned int      ip6_mtu(const struct dst_entry *dst);
85 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
86 static void             ip6_dst_destroy(struct dst_entry *);
87 static void             ip6_dst_ifdown(struct dst_entry *,
88                                        struct net_device *dev, int how);
89 static int               ip6_dst_gc(struct dst_ops *ops);
90
91 static int              ip6_pkt_discard(struct sk_buff *skb);
92 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
93 static int              ip6_pkt_prohibit(struct sk_buff *skb);
94 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
95 static void             ip6_link_failure(struct sk_buff *skb);
96 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
97                                            struct sk_buff *skb, u32 mtu);
98 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
99                                         struct sk_buff *skb);
100 static void             rt6_dst_from_metrics_check(struct rt6_info *rt);
101 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
102 static size_t rt6_nlmsg_size(struct rt6_info *rt);
103 static int rt6_fill_node(struct net *net,
104                          struct sk_buff *skb, struct rt6_info *rt,
105                          struct in6_addr *dst, struct in6_addr *src,
106                          int iif, int type, u32 portid, u32 seq,
107                          unsigned int flags);
108 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
109                                            struct in6_addr *daddr,
110                                            struct in6_addr *saddr);
111
112 #ifdef CONFIG_IPV6_ROUTE_INFO
113 static struct rt6_info *rt6_add_route_info(struct net *net,
114                                            const struct in6_addr *prefix, int prefixlen,
115                                            const struct in6_addr *gwaddr,
116                                            struct net_device *dev,
117                                            unsigned int pref);
118 static struct rt6_info *rt6_get_route_info(struct net *net,
119                                            const struct in6_addr *prefix, int prefixlen,
120                                            const struct in6_addr *gwaddr,
121                                            struct net_device *dev);
122 #endif
123
124 struct uncached_list {
125         spinlock_t              lock;
126         struct list_head        head;
127 };
128
129 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
130
131 static void rt6_uncached_list_add(struct rt6_info *rt)
132 {
133         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
134
135         rt->rt6i_uncached_list = ul;
136
137         spin_lock_bh(&ul->lock);
138         list_add_tail(&rt->rt6i_uncached, &ul->head);
139         spin_unlock_bh(&ul->lock);
140 }
141
142 static void rt6_uncached_list_del(struct rt6_info *rt)
143 {
144         if (!list_empty(&rt->rt6i_uncached)) {
145                 struct uncached_list *ul = rt->rt6i_uncached_list;
146                 struct net *net = dev_net(rt->dst.dev);
147
148                 spin_lock_bh(&ul->lock);
149                 list_del(&rt->rt6i_uncached);
150                 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
151                 spin_unlock_bh(&ul->lock);
152         }
153 }
154
155 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
156 {
157         struct net_device *loopback_dev = net->loopback_dev;
158         int cpu;
159
160         if (dev == loopback_dev)
161                 return;
162
163         for_each_possible_cpu(cpu) {
164                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
165                 struct rt6_info *rt;
166
167                 spin_lock_bh(&ul->lock);
168                 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
169                         struct inet6_dev *rt_idev = rt->rt6i_idev;
170                         struct net_device *rt_dev = rt->dst.dev;
171
172                         if (rt_idev->dev == dev) {
173                                 rt->rt6i_idev = in6_dev_get(loopback_dev);
174                                 in6_dev_put(rt_idev);
175                         }
176
177                         if (rt_dev == dev) {
178                                 rt->dst.dev = loopback_dev;
179                                 dev_hold(rt->dst.dev);
180                                 dev_put(rt_dev);
181                         }
182                 }
183                 spin_unlock_bh(&ul->lock);
184         }
185 }
186
187 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
188 {
189         return dst_metrics_write_ptr(rt->dst.from);
190 }
191
192 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
193 {
194         struct rt6_info *rt = (struct rt6_info *)dst;
195
196         if (rt->rt6i_flags & RTF_PCPU)
197                 return rt6_pcpu_cow_metrics(rt);
198         else if (rt->rt6i_flags & RTF_CACHE)
199                 return NULL;
200         else
201                 return dst_cow_metrics_generic(dst, old);
202 }
203
204 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
205                                              struct sk_buff *skb,
206                                              const void *daddr)
207 {
208         struct in6_addr *p = &rt->rt6i_gateway;
209
210         if (!ipv6_addr_any(p))
211                 return (const void *) p;
212         else if (skb)
213                 return &ipv6_hdr(skb)->daddr;
214         return daddr;
215 }
216
217 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
218                                           struct sk_buff *skb,
219                                           const void *daddr)
220 {
221         struct rt6_info *rt = (struct rt6_info *) dst;
222         struct neighbour *n;
223
224         daddr = choose_neigh_daddr(rt, skb, daddr);
225         n = __ipv6_neigh_lookup(dst->dev, daddr);
226         if (n)
227                 return n;
228         return neigh_create(&nd_tbl, daddr, dst->dev);
229 }
230
231 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
232 {
233         struct net_device *dev = dst->dev;
234         struct rt6_info *rt = (struct rt6_info *)dst;
235
236         daddr = choose_neigh_daddr(rt, NULL, daddr);
237         if (!daddr)
238                 return;
239         if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
240                 return;
241         if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
242                 return;
243         __ipv6_confirm_neigh(dev, daddr);
244 }
245
246 static struct dst_ops ip6_dst_ops_template = {
247         .family                 =       AF_INET6,
248         .gc                     =       ip6_dst_gc,
249         .gc_thresh              =       1024,
250         .check                  =       ip6_dst_check,
251         .default_advmss         =       ip6_default_advmss,
252         .mtu                    =       ip6_mtu,
253         .cow_metrics            =       ipv6_cow_metrics,
254         .destroy                =       ip6_dst_destroy,
255         .ifdown                 =       ip6_dst_ifdown,
256         .negative_advice        =       ip6_negative_advice,
257         .link_failure           =       ip6_link_failure,
258         .update_pmtu            =       ip6_rt_update_pmtu,
259         .redirect               =       rt6_do_redirect,
260         .local_out              =       __ip6_local_out,
261         .neigh_lookup           =       ip6_neigh_lookup,
262         .confirm_neigh          =       ip6_confirm_neigh,
263 };
264
265 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
266 {
267         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
268
269         return mtu ? : dst->dev->mtu;
270 }
271
272 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
273                                          struct sk_buff *skb, u32 mtu)
274 {
275 }
276
/* dst_ops->redirect handler for blackhole dsts: deliberately a no-op. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst,
				      struct sock *sk,
				      struct sk_buff *skb)
{
	/* redirects are ignored */
}
281
282 static struct dst_ops ip6_dst_blackhole_ops = {
283         .family                 =       AF_INET6,
284         .destroy                =       ip6_dst_destroy,
285         .check                  =       ip6_dst_check,
286         .mtu                    =       ip6_blackhole_mtu,
287         .default_advmss         =       ip6_default_advmss,
288         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
289         .redirect               =       ip6_rt_blackhole_redirect,
290         .cow_metrics            =       dst_cow_metrics_generic,
291         .neigh_lookup           =       ip6_neigh_lookup,
292 };
293
294 static const u32 ip6_template_metrics[RTAX_MAX] = {
295         [RTAX_HOPLIMIT - 1] = 0,
296 };
297
298 static const struct rt6_info ip6_null_entry_template = {
299         .dst = {
300                 .__refcnt       = ATOMIC_INIT(1),
301                 .__use          = 1,
302                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
303                 .error          = -ENETUNREACH,
304                 .input          = ip6_pkt_discard,
305                 .output         = ip6_pkt_discard_out,
306         },
307         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
308         .rt6i_protocol  = RTPROT_KERNEL,
309         .rt6i_metric    = ~(u32) 0,
310         .rt6i_ref       = ATOMIC_INIT(1),
311 };
312
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

/*
 * Template for a reject route that reports -EACCES: packets are handed to
 * ip6_pkt_prohibit() on input and ip6_pkt_prohibit_out() on output.
 */
static const struct rt6_info ip6_prohibit_entry_template = {
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
};

/*
 * Template for a reject route that reports -EINVAL: packets are handed to
 * the generic dst_discard() / dst_discard_out() handlers.
 */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
};

#endif
346
347 static void rt6_info_init(struct rt6_info *rt)
348 {
349         struct dst_entry *dst = &rt->dst;
350
351         memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
352         INIT_LIST_HEAD(&rt->rt6i_siblings);
353         INIT_LIST_HEAD(&rt->rt6i_uncached);
354 }
355
356 /* allocate dst with ip6_dst_ops */
357 static struct rt6_info *__ip6_dst_alloc(struct net *net,
358                                         struct net_device *dev,
359                                         int flags)
360 {
361         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
362                                         1, DST_OBSOLETE_FORCE_CHK, flags);
363
364         if (rt) {
365                 rt6_info_init(rt);
366                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
367         }
368
369         return rt;
370 }
371
372 struct rt6_info *ip6_dst_alloc(struct net *net,
373                                struct net_device *dev,
374                                int flags)
375 {
376         struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
377
378         if (rt) {
379                 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
380                 if (!rt->rt6i_pcpu) {
381                         dst_release_immediate(&rt->dst);
382                         return NULL;
383                 }
384         }
385
386         return rt;
387 }
388 EXPORT_SYMBOL(ip6_dst_alloc);
389
390 static void ip6_dst_destroy(struct dst_entry *dst)
391 {
392         struct rt6_info *rt = (struct rt6_info *)dst;
393         struct rt6_exception_bucket *bucket;
394         struct dst_entry *from = dst->from;
395         struct inet6_dev *idev;
396
397         dst_destroy_metrics_generic(dst);
398         free_percpu(rt->rt6i_pcpu);
399         rt6_uncached_list_del(rt);
400
401         idev = rt->rt6i_idev;
402         if (idev) {
403                 rt->rt6i_idev = NULL;
404                 in6_dev_put(idev);
405         }
406         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
407         if (bucket) {
408                 rt->rt6i_exception_bucket = NULL;
409                 kfree(bucket);
410         }
411
412         dst->from = NULL;
413         dst_release(from);
414 }
415
416 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
417                            int how)
418 {
419         struct rt6_info *rt = (struct rt6_info *)dst;
420         struct inet6_dev *idev = rt->rt6i_idev;
421         struct net_device *loopback_dev =
422                 dev_net(dev)->loopback_dev;
423
424         if (idev && idev->dev != loopback_dev) {
425                 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
426                 if (loopback_idev) {
427                         rt->rt6i_idev = loopback_idev;
428                         in6_dev_put(idev);
429                 }
430         }
431 }
432
433 static bool __rt6_check_expired(const struct rt6_info *rt)
434 {
435         if (rt->rt6i_flags & RTF_EXPIRES)
436                 return time_after(jiffies, rt->dst.expires);
437         else
438                 return false;
439 }
440
441 static bool rt6_check_expired(const struct rt6_info *rt)
442 {
443         if (rt->rt6i_flags & RTF_EXPIRES) {
444                 if (time_after(jiffies, rt->dst.expires))
445                         return true;
446         } else if (rt->dst.from) {
447                 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
448                        rt6_check_expired((struct rt6_info *)rt->dst.from);
449         }
450         return false;
451 }
452
453 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
454                                              struct flowi6 *fl6, int oif,
455                                              int strict)
456 {
457         struct rt6_info *sibling, *next_sibling;
458         int route_choosen;
459
460         /* We might have already computed the hash for ICMPv6 errors. In such
461          * case it will always be non-zero. Otherwise now is the time to do it.
462          */
463         if (!fl6->mp_hash)
464                 fl6->mp_hash = rt6_multipath_hash(fl6, NULL);
465
466         route_choosen = fl6->mp_hash % (match->rt6i_nsiblings + 1);
467         /* Don't change the route, if route_choosen == 0
468          * (siblings does not include ourself)
469          */
470         if (route_choosen)
471                 list_for_each_entry_safe(sibling, next_sibling,
472                                 &match->rt6i_siblings, rt6i_siblings) {
473                         route_choosen--;
474                         if (route_choosen == 0) {
475                                 struct inet6_dev *idev = sibling->rt6i_idev;
476
477                                 if (!netif_carrier_ok(sibling->dst.dev) &&
478                                     idev->cnf.ignore_routes_with_linkdown)
479                                         break;
480                                 if (rt6_score_route(sibling, oif, strict) < 0)
481                                         break;
482                                 match = sibling;
483                                 break;
484                         }
485                 }
486         return match;
487 }
488
489 /*
490  *      Route lookup. rcu_read_lock() should be held.
491  */
492
493 static inline struct rt6_info *rt6_device_match(struct net *net,
494                                                     struct rt6_info *rt,
495                                                     const struct in6_addr *saddr,
496                                                     int oif,
497                                                     int flags)
498 {
499         struct rt6_info *local = NULL;
500         struct rt6_info *sprt;
501
502         if (!oif && ipv6_addr_any(saddr))
503                 goto out;
504
505         for (sprt = rt; sprt; sprt = rcu_dereference(sprt->dst.rt6_next)) {
506                 struct net_device *dev = sprt->dst.dev;
507
508                 if (oif) {
509                         if (dev->ifindex == oif)
510                                 return sprt;
511                         if (dev->flags & IFF_LOOPBACK) {
512                                 if (!sprt->rt6i_idev ||
513                                     sprt->rt6i_idev->dev->ifindex != oif) {
514                                         if (flags & RT6_LOOKUP_F_IFACE)
515                                                 continue;
516                                         if (local &&
517                                             local->rt6i_idev->dev->ifindex == oif)
518                                                 continue;
519                                 }
520                                 local = sprt;
521                         }
522                 } else {
523                         if (ipv6_chk_addr(net, saddr, dev,
524                                           flags & RT6_LOOKUP_F_IFACE))
525                                 return sprt;
526                 }
527         }
528
529         if (oif) {
530                 if (local)
531                         return local;
532
533                 if (flags & RT6_LOOKUP_F_IFACE)
534                         return net->ipv6.ip6_null_entry;
535         }
536 out:
537         return rt;
538 }
539
#ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred neighbour solicitation request queued by rt6_probe(). */
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;		/* gateway address to solicit */
	struct net_device *dev;		/* held until the work item has run */
};

/*
 * Work handler: send a neighbour solicitation for the stored target to
 * its solicited-node multicast address, then drop the device reference
 * and free the work item.
 */
static void rt6_probe_deferred(struct work_struct *w)
{
	struct __rt6_probe_work *probe =
		container_of(w, struct __rt6_probe_work, work);
	struct in6_addr solicited;

	addrconf_addr_solict_mult(&probe->target, &solicited);
	ndisc_send_ns(probe->dev, &probe->target, &solicited, NULL, 0);
	dev_put(probe->dev);
	kfree(probe);
}

/*
 * Router Reachability Probing.
 *
 * For a gateway route whose neighbour entry is missing or not NUD_VALID,
 * schedule a neighbour solicitation towards the gateway.  Router
 * Reachability Probes MUST be rate-limited to no more than one per
 * minute; for an existing neighbour entry this is done by comparing
 * neigh->updated against the interface's rtr_probe_interval.
 *
 * NOTE(review): no interval check is visible here for the case where no
 * neighbour entry exists at all.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct __rt6_probe_work *work = NULL;
	struct neighbour *neigh;

	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
		return;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (!neigh) {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	} else if (!(neigh->nud_state & NUD_VALID)) {
		write_lock(&neigh->lock);
		/* re-check the state now that the lock is held */
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated +
			       rt->rt6i_idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = rt->rt6i_gateway;
		dev_hold(rt->dst.dev);
		work->dev = rt->dst.dev;
		schedule_work(&work->work);
	}

	rcu_read_unlock_bh();
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF no probing is done. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
610
611 /*
612  * Default Router Selection (RFC 2461 6.3.6)
613  */
614 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
615 {
616         struct net_device *dev = rt->dst.dev;
617         if (!oif || dev->ifindex == oif)
618                 return 2;
619         if ((dev->flags & IFF_LOOPBACK) &&
620             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
621                 return 1;
622         return 0;
623 }
624
625 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
626 {
627         struct neighbour *neigh;
628         enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
629
630         if (rt->rt6i_flags & RTF_NONEXTHOP ||
631             !(rt->rt6i_flags & RTF_GATEWAY))
632                 return RT6_NUD_SUCCEED;
633
634         rcu_read_lock_bh();
635         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
636         if (neigh) {
637                 read_lock(&neigh->lock);
638                 if (neigh->nud_state & NUD_VALID)
639                         ret = RT6_NUD_SUCCEED;
640 #ifdef CONFIG_IPV6_ROUTER_PREF
641                 else if (!(neigh->nud_state & NUD_FAILED))
642                         ret = RT6_NUD_SUCCEED;
643                 else
644                         ret = RT6_NUD_FAIL_PROBE;
645 #endif
646                 read_unlock(&neigh->lock);
647         } else {
648                 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
649                       RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
650         }
651         rcu_read_unlock_bh();
652
653         return ret;
654 }
655
656 static int rt6_score_route(struct rt6_info *rt, int oif,
657                            int strict)
658 {
659         int m;
660
661         m = rt6_check_dev(rt, oif);
662         if (!m && (strict & RT6_LOOKUP_F_IFACE))
663                 return RT6_NUD_FAIL_HARD;
664 #ifdef CONFIG_IPV6_ROUTER_PREF
665         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
666 #endif
667         if (strict & RT6_LOOKUP_F_REACHABLE) {
668                 int n = rt6_check_neigh(rt);
669                 if (n < 0)
670                         return n;
671         }
672         return m;
673 }
674
675 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
676                                    int *mpri, struct rt6_info *match,
677                                    bool *do_rr)
678 {
679         int m;
680         bool match_do_rr = false;
681         struct inet6_dev *idev = rt->rt6i_idev;
682         struct net_device *dev = rt->dst.dev;
683
684         if (dev && !netif_carrier_ok(dev) &&
685             idev->cnf.ignore_routes_with_linkdown &&
686             !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
687                 goto out;
688
689         if (rt6_check_expired(rt))
690                 goto out;
691
692         m = rt6_score_route(rt, oif, strict);
693         if (m == RT6_NUD_FAIL_DO_RR) {
694                 match_do_rr = true;
695                 m = 0; /* lowest valid score */
696         } else if (m == RT6_NUD_FAIL_HARD) {
697                 goto out;
698         }
699
700         if (strict & RT6_LOOKUP_F_REACHABLE)
701                 rt6_probe(rt);
702
703         /* note that m can be RT6_NUD_FAIL_PROBE at this point */
704         if (m > *mpri) {
705                 *do_rr = match_do_rr;
706                 *mpri = m;
707                 match = rt;
708         }
709 out:
710         return match;
711 }
712
713 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
714                                      struct rt6_info *leaf,
715                                      struct rt6_info *rr_head,
716                                      u32 metric, int oif, int strict,
717                                      bool *do_rr)
718 {
719         struct rt6_info *rt, *match, *cont;
720         int mpri = -1;
721
722         match = NULL;
723         cont = NULL;
724         for (rt = rr_head; rt; rt = rcu_dereference(rt->dst.rt6_next)) {
725                 if (rt->rt6i_metric != metric) {
726                         cont = rt;
727                         break;
728                 }
729
730                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
731         }
732
733         for (rt = leaf; rt && rt != rr_head;
734              rt = rcu_dereference(rt->dst.rt6_next)) {
735                 if (rt->rt6i_metric != metric) {
736                         cont = rt;
737                         break;
738                 }
739
740                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
741         }
742
743         if (match || !cont)
744                 return match;
745
746         for (rt = cont; rt; rt = rcu_dereference(rt->dst.rt6_next))
747                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
748
749         return match;
750 }
751
752 static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
753                                    int oif, int strict)
754 {
755         struct rt6_info *leaf = rcu_dereference(fn->leaf);
756         struct rt6_info *match, *rt0;
757         bool do_rr = false;
758         int key_plen;
759
760         if (!leaf || leaf == net->ipv6.ip6_null_entry)
761                 return net->ipv6.ip6_null_entry;
762
763         rt0 = rcu_dereference(fn->rr_ptr);
764         if (!rt0)
765                 rt0 = leaf;
766
767         /* Double check to make sure fn is not an intermediate node
768          * and fn->leaf does not points to its child's leaf
769          * (This might happen if all routes under fn are deleted from
770          * the tree and fib6_repair_tree() is called on the node.)
771          */
772         key_plen = rt0->rt6i_dst.plen;
773 #ifdef CONFIG_IPV6_SUBTREES
774         if (rt0->rt6i_src.plen)
775                 key_plen = rt0->rt6i_src.plen;
776 #endif
777         if (fn->fn_bit != key_plen)
778                 return net->ipv6.ip6_null_entry;
779
780         match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
781                              &do_rr);
782
783         if (do_rr) {
784                 struct rt6_info *next = rcu_dereference(rt0->dst.rt6_next);
785
786                 /* no entries matched; do round-robin */
787                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
788                         next = leaf;
789
790                 if (next != rt0) {
791                         spin_lock_bh(&leaf->rt6i_table->tb6_lock);
792                         /* make sure next is not being deleted from the tree */
793                         if (next->rt6i_node)
794                                 rcu_assign_pointer(fn->rr_ptr, next);
795                         spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
796                 }
797         }
798
799         return match ? match : net->ipv6.ip6_null_entry;
800 }
801
802 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
803 {
804         return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
805 }
806
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a received route information option.
 *
 * @dev:    device the option arrived on
 * @opt:    start of the option, interpreted as struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: address of the advertising router
 *
 * Depending on the advertised lifetime the matching route is deleted,
 * created, or has its preference and expiry refreshed.
 *
 * Returns 0 on success and -EINVAL when the option is too short, its
 * length/prefix_len fields are inconsistent, or the preference is invalid.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct route_info *rinfo = (struct route_info *) opt;
	struct net *net = dev_net(dev);
	struct in6_addr prefix_buf, *prefix;
	unsigned long lifetime;
	unsigned int pref;
	struct rt6_info *rt;
	int min_length;

	if (len < sizeof(struct route_info))
		return -EINVAL;

	/* Sanity check for prefix_len and length: longer prefixes need a
	 * larger option length.
	 */
	if (rinfo->prefix_len > 64)
		min_length = 2;
	else if (rinfo->prefix_len > 0)
		min_length = 1;
	else
		min_length = 0;

	if (rinfo->length > 3 || rinfo->prefix_len > 128 ||
	    rinfo->length < min_length)
		return -EINVAL;

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3) {
		prefix = (struct in6_addr *)rinfo->prefix;
	} else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	/* a zero-length prefix refers to the default router entry */
	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	/* lifetime 0 withdraws an existing route */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
	else if (lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);

	if (!rt)
		return 0;

	if (addrconf_finite_timeout(lifetime))
		rt6_set_expires(rt, jiffies + HZ * lifetime);
	else
		rt6_clean_expires(rt);

	ip6_rt_put(rt);
	return 0;
}
#endif
882
883 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
884                                         struct in6_addr *saddr)
885 {
886         struct fib6_node *pn, *sn;
887         while (1) {
888                 if (fn->fn_flags & RTN_TL_ROOT)
889                         return NULL;
890                 pn = rcu_dereference(fn->parent);
891                 sn = FIB6_SUBTREE(pn);
892                 if (sn && sn != fn)
893                         fn = fib6_lookup(sn, NULL, saddr);
894                 else
895                         fn = pn;
896                 if (fn->fn_flags & RTN_RTINFO)
897                         return fn;
898         }
899 }
900
901 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
902                           bool null_fallback)
903 {
904         struct rt6_info *rt = *prt;
905
906         if (dst_hold_safe(&rt->dst))
907                 return true;
908         if (null_fallback) {
909                 rt = net->ipv6.ip6_null_entry;
910                 dst_hold(&rt->dst);
911         } else {
912                 rt = NULL;
913         }
914         *prt = rt;
915         return false;
916 }
917
/* Look up a route for @fl6 in @table.
 *
 * The whole search runs under rcu_read_lock().  Starting from the node
 * returned by fib6_lookup(), the leaf is filtered through
 * rt6_device_match() (and rt6_multipath_select() when the route has
 * siblings and no oif was given).  If that yields ip6_null_entry the
 * search backtracks via fib6_backtrack() and retries.  A matching entry
 * in the route's exception table replaces the result.
 *
 * The returned route has a reference held; if taking the reference fails,
 * ip6_null_entry (held) is returned instead.
 */
918 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
919                                              struct fib6_table *table,
920                                              struct flowi6 *fl6, int flags)
921 {
922         struct rt6_info *rt, *rt_cache;
923         struct fib6_node *fn;
924
925         rcu_read_lock();
926         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
927 restart:
928         rt = rcu_dereference(fn->leaf);
929         if (!rt) {
930                 rt = net->ipv6.ip6_null_entry;
931         } else {
932                 rt = rt6_device_match(net, rt, &fl6->saddr,
933                                       fl6->flowi6_oif, flags);
934                 if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
935                         rt = rt6_multipath_select(rt, fl6,
936                                                   fl6->flowi6_oif, flags);
937         }
            /* Nothing usable at this node: retry from the node found by
             * backtracking, if there is one.
             */
938         if (rt == net->ipv6.ip6_null_entry) {
939                 fn = fib6_backtrack(fn, &fl6->saddr);
940                 if (fn)
941                         goto restart;
942         }
943         /* Search through exception table */
944         rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
945         if (rt_cache)
946                 rt = rt_cache;
947
            /* On failure ip6_hold_safe() swaps rt for the (held) null entry,
             * so the usage update is only done for the route actually found.
             */
948         if (ip6_hold_safe(net, &rt, true))
949                 dst_use_noref(&rt->dst, jiffies);
950
951         rcu_read_unlock();
952
953         trace_fib6_table_lookup(net, rt, table, fl6);
954
955         return rt;
956
957 }
958
959 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
960                                     int flags)
961 {
962         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
963 }
964 EXPORT_SYMBOL_GPL(ip6_route_lookup);
965
966 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
967                             const struct in6_addr *saddr, int oif, int strict)
968 {
969         struct flowi6 fl6 = {
970                 .flowi6_oif = oif,
971                 .daddr = *daddr,
972         };
973         struct dst_entry *dst;
974         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
975
976         if (saddr) {
977                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
978                 flags |= RT6_LOOKUP_F_HAS_SADDR;
979         }
980
981         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
982         if (dst->error == 0)
983                 return (struct rt6_info *) dst;
984
985         dst_release(dst);
986
987         return NULL;
988 }
989 EXPORT_SYMBOL(rt6_lookup);
990
991 /* ip6_ins_rt is called with FREE table->tb6_lock.
992  * It takes new route entry, the addition fails by any reason the
993  * route is released.
994  * Caller must hold dst before calling it.
995  */
996
997 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
998                         struct mx6_config *mxc,
999                         struct netlink_ext_ack *extack)
1000 {
1001         int err;
1002         struct fib6_table *table;
1003
1004         table = rt->rt6i_table;
1005         spin_lock_bh(&table->tb6_lock);
1006         err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
1007         spin_unlock_bh(&table->tb6_lock);
1008
1009         return err;
1010 }
1011
1012 int ip6_ins_rt(struct rt6_info *rt)
1013 {
1014         struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
1015         struct mx6_config mxc = { .mx = NULL, };
1016
1017         /* Hold dst to account for the reference from the fib6 tree */
1018         dst_hold(&rt->dst);
1019         return __ip6_ins_rt(rt, &info, &mxc, NULL);
1020 }
1021
1022 /* called with rcu_lock held */
1023 static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
1024 {
1025         struct net_device *dev = rt->dst.dev;
1026
1027         if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) {
1028                 /* for copies of local routes, dst->dev needs to be the
1029                  * device if it is a master device, the master device if
1030                  * device is enslaved, and the loopback as the default
1031                  */
1032                 if (netif_is_l3_slave(dev) &&
1033                     !rt6_need_strict(&rt->rt6i_dst.addr))
1034                         dev = l3mdev_master_dev_rcu(dev);
1035                 else if (!netif_is_l3_master(dev))
1036                         dev = dev_net(dev)->loopback_dev;
1037                 /* last case is netif_is_l3_master(dev) is true in which
1038                  * case we want dev returned to be dev
1039                  */
1040         }
1041
1042         return dev;
1043 }
1044
1045 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
1046                                            const struct in6_addr *daddr,
1047                                            const struct in6_addr *saddr)
1048 {
1049         struct net_device *dev;
1050         struct rt6_info *rt;
1051
1052         /*
1053          *      Clone the route.
1054          */
1055
1056         if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
1057                 ort = (struct rt6_info *)ort->dst.from;
1058
1059         rcu_read_lock();
1060         dev = ip6_rt_get_dev_rcu(ort);
1061         rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
1062         rcu_read_unlock();
1063         if (!rt)
1064                 return NULL;
1065
1066         ip6_rt_copy_init(rt, ort);
1067         rt->rt6i_flags |= RTF_CACHE;
1068         rt->rt6i_metric = 0;
1069         rt->dst.flags |= DST_HOST;
1070         rt->rt6i_dst.addr = *daddr;
1071         rt->rt6i_dst.plen = 128;
1072
1073         if (!rt6_is_gw_or_nonexthop(ort)) {
1074                 if (ort->rt6i_dst.plen != 128 &&
1075                     ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
1076                         rt->rt6i_flags |= RTF_ANYCAST;
1077 #ifdef CONFIG_IPV6_SUBTREES
1078                 if (rt->rt6i_src.plen && saddr) {
1079                         rt->rt6i_src.addr = *saddr;
1080                         rt->rt6i_src.plen = 128;
1081                 }
1082 #endif
1083         }
1084
1085         return rt;
1086 }
1087
1088 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
1089 {
1090         struct net_device *dev;
1091         struct rt6_info *pcpu_rt;
1092
1093         rcu_read_lock();
1094         dev = ip6_rt_get_dev_rcu(rt);
1095         pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
1096         rcu_read_unlock();
1097         if (!pcpu_rt)
1098                 return NULL;
1099         ip6_rt_copy_init(pcpu_rt, rt);
1100         pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1101         pcpu_rt->rt6i_flags |= RTF_PCPU;
1102         return pcpu_rt;
1103 }
1104
1105 /* It should be called with rcu_read_lock() acquired */
1106 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1107 {
1108         struct rt6_info *pcpu_rt, **p;
1109
1110         p = this_cpu_ptr(rt->rt6i_pcpu);
1111         pcpu_rt = *p;
1112
1113         if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
1114                 rt6_dst_from_metrics_check(pcpu_rt);
1115
1116         return pcpu_rt;
1117 }
1118
/* Allocate a per-cpu copy of @rt and install it in this CPU's rt6i_pcpu
 * slot.
 *
 * Returns the new copy with a reference taken via dst_hold(), or the
 * namespace's ip6_null_entry (also held) if the allocation fails.
 */
1119 static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
1120 {
1121         struct rt6_info *pcpu_rt, *prev, **p;
1122
1123         pcpu_rt = ip6_rt_pcpu_alloc(rt);
1124         if (!pcpu_rt) {
1125                 struct net *net = dev_net(rt->dst.dev);
1126
1127                 dst_hold(&net->ipv6.ip6_null_entry->dst);
1128                 return net->ipv6.ip6_null_entry;
1129         }
1130
1131         dst_hold(&pcpu_rt->dst);
1132         p = this_cpu_ptr(rt->rt6i_pcpu);
             /* Install only if the slot is still NULL; finding an existing
              * entry here is treated as a fatal bug.
              */
1133         prev = cmpxchg(p, NULL, pcpu_rt);
1134         BUG_ON(prev);
1135
1136         rt6_dst_from_metrics_check(pcpu_rt);
1137         return pcpu_rt;
1138 }
1139
1140 /* exception hash table implementation
1141  */
/* rt6_exception_lock is taken with spin_lock_bh() by the functions in this
 * file that insert into, remove from, or flush a route's exception buckets.
 */
1142 static DEFINE_SPINLOCK(rt6_exception_lock);
1143
1144 /* Remove rt6_ex from hash table and free the memory
1145  * Caller must hold rt6_exception_lock
1146  */
/* @bucket is the bucket whose chain holds @rt6_ex; its depth counter and
 * the per-netns fib_rt_cache statistic are both decremented.  A NULL
 * @bucket or @rt6_ex makes this a no-op.
 */
1147 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1148                                  struct rt6_exception *rt6_ex)
1149 {
1150         struct net *net;
1151
1152         if (!bucket || !rt6_ex)
1153                 return;
1154
             /* NOTE(review): net is presumably fetched up front because
              * rt6_release() below may drop the last reference - confirm.
              */
1155         net = dev_net(rt6_ex->rt6i->dst.dev);
1156         rt6_ex->rt6i->rt6i_node = NULL;
1157         hlist_del_rcu(&rt6_ex->hlist);
1158         rt6_release(rt6_ex->rt6i);
             /* The entry itself is freed through kfree_rcu(), not immediately. */
1159         kfree_rcu(rt6_ex, rcu);
1160         WARN_ON_ONCE(!bucket->depth);
1161         bucket->depth--;
1162         net->ipv6.rt6_stats->fib_rt_cache--;
1163 }
1164
1165 /* Remove oldest rt6_ex in bucket and free the memory
1166  * Caller must hold rt6_exception_lock
1167  */
1168 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1169 {
1170         struct rt6_exception *rt6_ex, *oldest = NULL;
1171
1172         if (!bucket)
1173                 return;
1174
1175         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1176                 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1177                         oldest = rt6_ex;
1178         }
1179         rt6_remove_exception(bucket, oldest);
1180 }
1181
1182 static u32 rt6_exception_hash(const struct in6_addr *dst,
1183                               const struct in6_addr *src)
1184 {
1185         static u32 seed __read_mostly;
1186         u32 val;
1187
1188         net_get_random_once(&seed, sizeof(seed));
1189         val = jhash(dst, sizeof(*dst), seed);
1190
1191 #ifdef CONFIG_IPV6_SUBTREES
1192         if (src)
1193                 val = jhash(src, sizeof(*src), val);
1194 #endif
1195         return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1196 }
1197
1198 /* Helper function to find the cached rt in the hash table
1199  * and update bucket pointer to point to the bucket for this
1200  * (daddr, saddr) pair
1201  * Caller must hold rt6_exception_lock
1202  */
/* On entry *bucket is the base of the bucket array.  Returns the matching
 * entry, or NULL when *bucket or @daddr is NULL (in which case *bucket is
 * left untouched) or when no entry in the selected bucket matches.
 * @saddr is compared only when CONFIG_IPV6_SUBTREES is enabled and @saddr
 * is non-NULL.
 */
1203 static struct rt6_exception *
1204 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1205                               const struct in6_addr *daddr,
1206                               const struct in6_addr *saddr)
1207 {
1208         struct rt6_exception *rt6_ex;
1209         u32 hval;
1210
1211         if (!(*bucket) || !daddr)
1212                 return NULL;
1213
1214         hval = rt6_exception_hash(daddr, saddr);
             /* Advance the caller's pointer from the array base to the
              * bucket selected by the hash.
              */
1215         *bucket += hval;
1216
1217         hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1218                 struct rt6_info *rt6 = rt6_ex->rt6i;
1219                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1220
1221 #ifdef CONFIG_IPV6_SUBTREES
1222                 if (matched && saddr)
1223                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1224 #endif
1225                 if (matched)
1226                         return rt6_ex;
1227         }
1228         return NULL;
1229 }
1230
1231 /* Helper function to find the cached rt in the hash table
1232  * and update bucket pointer to point to the bucket for this
1233  * (daddr, saddr) pair
1234  * Caller must hold rcu_read_lock()
1235  */
/* Same lookup as __rt6_find_exception_spinlock(), but the chain is walked
 * with hlist_for_each_entry_rcu() and a one-time warning is emitted when
 * rcu_read_lock_held() reports false.  Returns the matching entry, or NULL
 * when *bucket or @daddr is NULL or nothing matches.
 */
1236 static struct rt6_exception *
1237 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1238                          const struct in6_addr *daddr,
1239                          const struct in6_addr *saddr)
1240 {
1241         struct rt6_exception *rt6_ex;
1242         u32 hval;
1243
1244         WARN_ON_ONCE(!rcu_read_lock_held());
1245
1246         if (!(*bucket) || !daddr)
1247                 return NULL;
1248
1249         hval = rt6_exception_hash(daddr, saddr);
             /* Advance the caller's pointer from the array base to the
              * bucket selected by the hash.
              */
1250         *bucket += hval;
1251
1252         hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1253                 struct rt6_info *rt6 = rt6_ex->rt6i;
1254                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1255
1256 #ifdef CONFIG_IPV6_SUBTREES
1257                 if (matched && saddr)
1258                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1259 #endif
1260                 if (matched)
1261                         return rt6_ex;
1262         }
1263         return NULL;
1264 }
1265
/* Insert @nrt into the exception hash table of @ort.
 *
 * If @ort is an RTF_CACHE or RTF_PCPU route, ort->dst.from is used as the
 * owner instead.  The bucket array is allocated on first use with
 * GFP_ATOMIC.  An existing entry with the same key is removed first, and
 * if the bucket grows past FIB6_MAX_DEPTH its oldest entry is evicted.
 *
 * Returns 0 on success, -EINVAL if the owner's table has been flushed or
 * nrt's pmtu is not below the owner's mtu, -ENOMEM on allocation failure.
 * On success the owner's sernum is updated and fib6 gc is started.
 */
1266 static int rt6_insert_exception(struct rt6_info *nrt,
1267                                 struct rt6_info *ort)
1268 {
1269         struct net *net = dev_net(ort->dst.dev);
1270         struct rt6_exception_bucket *bucket;
1271         struct in6_addr *src_key = NULL;
1272         struct rt6_exception *rt6_ex;
1273         int err = 0;
1274
1275         /* ort can't be a cache or pcpu route */
1276         if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
1277                 ort = (struct rt6_info *)ort->dst.from;
1278         WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));
1279
1280         spin_lock_bh(&rt6_exception_lock);
1281
             /* Set by rt6_flush_exceptions(); refuse to rebuild the table. */
1282         if (ort->exception_bucket_flushed) {
1283                 err = -EINVAL;
1284                 goto out;
1285         }
1286
1287         bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1288                                         lockdep_is_held(&rt6_exception_lock));
1289         if (!bucket) {
1290                 bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1291                                  GFP_ATOMIC);
1292                 if (!bucket) {
1293                         err = -ENOMEM;
1294                         goto out;
1295                 }
1296                 rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1297         }
1298
1299 #ifdef CONFIG_IPV6_SUBTREES
1300         /* rt6i_src.plen != 0 indicates ort is in subtree
1301          * and exception table is indexed by a hash of
1302          * both rt6i_dst and rt6i_src.
1303          * Otherwise, the exception table is indexed by
1304          * a hash of only rt6i_dst.
1305          */
1306         if (ort->rt6i_src.plen)
1307                 src_key = &nrt->rt6i_src.addr;
1308 #endif
1309
1310         /* Update rt6i_prefsrc as it could be changed
1311          * in rt6_remove_prefsrc()
1312          */
1313         nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
1314         /* rt6_mtu_change() might lower mtu on ort.
1315          * Only insert this exception route if its mtu
1316          * is less than ort's mtu value.
1317          */
1318         if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
1319                 err = -EINVAL;
1320                 goto out;
1321         }
1322
             /* The lookup also moves 'bucket' from the array base to the
              * bucket for this key; every use of 'bucket' below refers to
              * that single bucket.
              */
1323         rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1324                                                src_key);
1325         if (rt6_ex)
1326                 rt6_remove_exception(bucket, rt6_ex);
1327
1328         rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1329         if (!rt6_ex) {
1330                 err = -ENOMEM;
1331                 goto out;
1332         }
1333         rt6_ex->rt6i = nrt;
1334         rt6_ex->stamp = jiffies;
             /* Reference on nrt owned by the table entry; dropped through
              * rt6_release() in rt6_remove_exception().
              */
1335         atomic_inc(&nrt->rt6i_ref);
1336         nrt->rt6i_node = ort->rt6i_node;
1337         hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1338         bucket->depth++;
1339         net->ipv6.rt6_stats->fib_rt_cache++;
1340
1341         if (bucket->depth > FIB6_MAX_DEPTH)
1342                 rt6_exception_remove_oldest(bucket);
1343
1344 out:
1345         spin_unlock_bh(&rt6_exception_lock);
1346
1347         /* Update fn->fn_sernum to invalidate all cached dst */
1348         if (!err) {
1349                 fib6_update_sernum(ort);
1350                 fib6_force_start_gc(net);
1351         }
1352
1353         return err;
1354 }
1355
1356 void rt6_flush_exceptions(struct rt6_info *rt)
1357 {
1358         struct rt6_exception_bucket *bucket;
1359         struct rt6_exception *rt6_ex;
1360         struct hlist_node *tmp;
1361         int i;
1362
1363         spin_lock_bh(&rt6_exception_lock);
1364         /* Prevent rt6_insert_exception() to recreate the bucket list */
1365         rt->exception_bucket_flushed = 1;
1366
1367         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1368                                     lockdep_is_held(&rt6_exception_lock));
1369         if (!bucket)
1370                 goto out;
1371
1372         for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1373                 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1374                         rt6_remove_exception(bucket, rt6_ex);
1375                 WARN_ON_ONCE(bucket->depth);
1376                 bucket++;
1377         }
1378
1379 out:
1380         spin_unlock_bh(&rt6_exception_lock);
1381 }
1382
1383 /* Find cached rt in the hash table inside passed in rt
1384  * Caller has to hold rcu_read_lock()
1385  */
1386 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
1387                                            struct in6_addr *daddr,
1388                                            struct in6_addr *saddr)
1389 {
1390         struct rt6_exception_bucket *bucket;
1391         struct in6_addr *src_key = NULL;
1392         struct rt6_exception *rt6_ex;
1393         struct rt6_info *res = NULL;
1394
1395         bucket = rcu_dereference(rt->rt6i_exception_bucket);
1396
1397 #ifdef CONFIG_IPV6_SUBTREES
1398         /* rt6i_src.plen != 0 indicates rt is in subtree
1399          * and exception table is indexed by a hash of
1400          * both rt6i_dst and rt6i_src.
1401          * Otherwise, the exception table is indexed by
1402          * a hash of only rt6i_dst.
1403          */
1404         if (rt->rt6i_src.plen)
1405                 src_key = saddr;
1406 #endif
1407         rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1408
1409         if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1410                 res = rt6_ex->rt6i;
1411
1412         return res;
1413 }
1414
1415 /* Remove the passed in cached rt from the hash table that contains it */
/* The owning route is taken from rt->dst.from.
 *
 * Returns 0 when the entry was found and removed, -EINVAL when @rt has no
 * 'from' route or is not RTF_CACHE, and -ENOENT when the owner has no
 * exception table or no entry matches @rt.
 */
1416 int rt6_remove_exception_rt(struct rt6_info *rt)
1417 {
1418         struct rt6_info *from = (struct rt6_info *)rt->dst.from;
1419         struct rt6_exception_bucket *bucket;
1420         struct in6_addr *src_key = NULL;
1421         struct rt6_exception *rt6_ex;
1422         int err;
1423
1424         if (!from ||
1425             !(rt->rt6i_flags & RTF_CACHE))
1426                 return -EINVAL;
1427
             /* Lockless early exit; the pointer is read again under the
              * lock below.
              */
1428         if (!rcu_access_pointer(from->rt6i_exception_bucket))
1429                 return -ENOENT;
1430
1431         spin_lock_bh(&rt6_exception_lock);
1432         bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1433                                     lockdep_is_held(&rt6_exception_lock));
1434 #ifdef CONFIG_IPV6_SUBTREES
1435         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1436          * and exception table is indexed by a hash of
1437          * both rt6i_dst and rt6i_src.
1438          * Otherwise, the exception table is indexed by
1439          * a hash of only rt6i_dst.
1440          */
1441         if (from->rt6i_src.plen)
1442                 src_key = &rt->rt6i_src.addr;
1443 #endif
             /* The lookup also advances 'bucket' to the bucket for this key,
              * which is what rt6_remove_exception() expects.
              */
1444         rt6_ex = __rt6_find_exception_spinlock(&bucket,
1445                                                &rt->rt6i_dst.addr,
1446                                                src_key);
1447         if (rt6_ex) {
1448                 rt6_remove_exception(bucket, rt6_ex);
1449                 err = 0;
1450         } else {
1451                 err = -ENOENT;
1452         }
1453
1454         spin_unlock_bh(&rt6_exception_lock);
1455         return err;
1456 }
1457
1458 /* Find rt6_ex which contains the passed in rt cache and
1459  * refresh its stamp
1460  */
1461 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1462 {
1463         struct rt6_info *from = (struct rt6_info *)rt->dst.from;
1464         struct rt6_exception_bucket *bucket;
1465         struct in6_addr *src_key = NULL;
1466         struct rt6_exception *rt6_ex;
1467
1468         if (!from ||
1469             !(rt->rt6i_flags & RTF_CACHE))
1470                 return;
1471
1472         rcu_read_lock();
1473         bucket = rcu_dereference(from->rt6i_exception_bucket);
1474
1475 #ifdef CONFIG_IPV6_SUBTREES
1476         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1477          * and exception table is indexed by a hash of
1478          * both rt6i_dst and rt6i_src.
1479          * Otherwise, the exception table is indexed by
1480          * a hash of only rt6i_dst.
1481          */
1482         if (from->rt6i_src.plen)
1483                 src_key = &rt->rt6i_src.addr;
1484 #endif
1485         rt6_ex = __rt6_find_exception_rcu(&bucket,
1486                                           &rt->rt6i_dst.addr,
1487                                           src_key);
1488         if (rt6_ex)
1489                 rt6_ex->stamp = jiffies;
1490
1491         rcu_read_unlock();
1492 }
1493
1494 static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
1495 {
1496         struct rt6_exception_bucket *bucket;
1497         struct rt6_exception *rt6_ex;
1498         int i;
1499
1500         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1501                                         lockdep_is_held(&rt6_exception_lock));
1502
1503         if (bucket) {
1504                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1505                         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1506                                 rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
1507                         }
1508                         bucket++;
1509                 }
1510         }
1511 }
1512
1513 static void rt6_exceptions_update_pmtu(struct rt6_info *rt, int mtu)
1514 {
1515         struct rt6_exception_bucket *bucket;
1516         struct rt6_exception *rt6_ex;
1517         int i;
1518
1519         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1520                                         lockdep_is_held(&rt6_exception_lock));
1521
1522         if (bucket) {
1523                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1524                         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1525                                 struct rt6_info *entry = rt6_ex->rt6i;
1526                                 /* For RTF_CACHE with rt6i_pmtu == 0
1527                                  * (i.e. a redirected route),
1528                                  * the metrics of its rt->dst.from has already
1529                                  * been updated.
1530                                  */
1531                                 if (entry->rt6i_pmtu && entry->rt6i_pmtu > mtu)
1532                                         entry->rt6i_pmtu = mtu;
1533                         }
1534                         bucket++;
1535                 }
1536         }
1537 }
1538
1539 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
1540
1541 static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
1542                                         struct in6_addr *gateway)
1543 {
1544         struct rt6_exception_bucket *bucket;
1545         struct rt6_exception *rt6_ex;
1546         struct hlist_node *tmp;
1547         int i;
1548
1549         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1550                 return;
1551
1552         spin_lock_bh(&rt6_exception_lock);
1553         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1554                                      lockdep_is_held(&rt6_exception_lock));
1555
1556         if (bucket) {
1557                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1558                         hlist_for_each_entry_safe(rt6_ex, tmp,
1559                                                   &bucket->chain, hlist) {
1560                                 struct rt6_info *entry = rt6_ex->rt6i;
1561
1562                                 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1563                                     RTF_CACHE_GATEWAY &&
1564                                     ipv6_addr_equal(gateway,
1565                                                     &entry->rt6i_gateway)) {
1566                                         rt6_remove_exception(bucket, rt6_ex);
1567                                 }
1568                         }
1569                         bucket++;
1570                 }
1571         }
1572
1573         spin_unlock_bh(&rt6_exception_lock);
1574 }
1575
1576 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1577                                       struct rt6_exception *rt6_ex,
1578                                       struct fib6_gc_args *gc_args,
1579                                       unsigned long now)
1580 {
1581         struct rt6_info *rt = rt6_ex->rt6i;
1582
1583         /* we are pruning and obsoleting aged-out and non gateway exceptions
1584          * even if others have still references to them, so that on next
1585          * dst_check() such references can be dropped.
1586          * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
1587          * expired, independently from their aging, as per RFC 8201 section 4
1588          */
1589         if (!(rt->rt6i_flags & RTF_EXPIRES) &&
1590             time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1591                 RT6_TRACE("aging clone %p\n", rt);
1592                 rt6_remove_exception(bucket, rt6_ex);
1593                 return;
1594         } else if (rt->rt6i_flags & RTF_GATEWAY) {
1595                 struct neighbour *neigh;
1596                 __u8 neigh_flags = 0;
1597
1598                 neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway);
1599                 if (neigh) {
1600                         neigh_flags = neigh->flags;
1601                         neigh_release(neigh);
1602                 }
1603                 if (!(neigh_flags & NTF_ROUTER)) {
1604                         RT6_TRACE("purging route %p via non-router but gateway\n",
1605                                   rt);
1606                         rt6_remove_exception(bucket, rt6_ex);
1607                         return;
1608                 }
1609         } else if (__rt6_check_expired(rt)) {
1610                 RT6_TRACE("purging expired route %p\n", rt);
1611                 rt6_remove_exception(bucket, rt6_ex);
1612                 return;
1613         }
1614         gc_args->more++;
1615 }
1616
1617 void rt6_age_exceptions(struct rt6_info *rt,
1618                         struct fib6_gc_args *gc_args,
1619                         unsigned long now)
1620 {
1621         struct rt6_exception_bucket *bucket;
1622         struct rt6_exception *rt6_ex;
1623         struct hlist_node *tmp;
1624         int i;
1625
1626         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1627                 return;
1628
1629         spin_lock_bh(&rt6_exception_lock);
1630         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1631                                     lockdep_is_held(&rt6_exception_lock));
1632
1633         if (bucket) {
1634                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1635                         hlist_for_each_entry_safe(rt6_ex, tmp,
1636                                                   &bucket->chain, hlist) {
1637                                 rt6_age_examine_exception(bucket, rt6_ex,
1638                                                           gc_args, now);
1639                         }
1640                         bucket++;
1641                 }
1642         }
1643         spin_unlock_bh(&rt6_exception_lock);
1644 }
1645
1646 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1647                                int oif, struct flowi6 *fl6, int flags)
1648 {
1649         struct fib6_node *fn, *saved_fn;
1650         struct rt6_info *rt, *rt_cache;
1651         int strict = 0;
1652
1653         strict |= flags & RT6_LOOKUP_F_IFACE;
1654         strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1655         if (net->ipv6.devconf_all->forwarding == 0)
1656                 strict |= RT6_LOOKUP_F_REACHABLE;
1657
1658         rcu_read_lock();
1659
1660         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1661         saved_fn = fn;
1662
1663         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1664                 oif = 0;
1665
1666 redo_rt6_select:
1667         rt = rt6_select(net, fn, oif, strict);
1668         if (rt->rt6i_nsiblings)
1669                 rt = rt6_multipath_select(rt, fl6, oif, strict);
1670         if (rt == net->ipv6.ip6_null_entry) {
1671                 fn = fib6_backtrack(fn, &fl6->saddr);
1672                 if (fn)
1673                         goto redo_rt6_select;
1674                 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1675                         /* also consider unreachable route */
1676                         strict &= ~RT6_LOOKUP_F_REACHABLE;
1677                         fn = saved_fn;
1678                         goto redo_rt6_select;
1679                 }
1680         }
1681
1682         /*Search through exception table */
1683         rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
1684         if (rt_cache)
1685                 rt = rt_cache;
1686
1687         if (rt == net->ipv6.ip6_null_entry) {
1688                 rcu_read_unlock();
1689                 dst_hold(&rt->dst);
1690                 trace_fib6_table_lookup(net, rt, table, fl6);
1691                 return rt;
1692         } else if (rt->rt6i_flags & RTF_CACHE) {
1693                 if (ip6_hold_safe(net, &rt, true)) {
1694                         dst_use_noref(&rt->dst, jiffies);
1695                         rt6_dst_from_metrics_check(rt);
1696                 }
1697                 rcu_read_unlock();
1698                 trace_fib6_table_lookup(net, rt, table, fl6);
1699                 return rt;
1700         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1701                             !(rt->rt6i_flags & RTF_GATEWAY))) {
1702                 /* Create a RTF_CACHE clone which will not be
1703                  * owned by the fib6 tree.  It is for the special case where
1704                  * the daddr in the skb during the neighbor look-up is different
1705                  * from the fl6->daddr used to look-up route here.
1706                  */
1707
1708                 struct rt6_info *uncached_rt;
1709
1710                 if (ip6_hold_safe(net, &rt, true)) {
1711                         dst_use_noref(&rt->dst, jiffies);
1712                 } else {
1713                         rcu_read_unlock();
1714                         uncached_rt = rt;
1715                         goto uncached_rt_out;
1716                 }
1717                 rcu_read_unlock();
1718
1719                 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1720                 dst_release(&rt->dst);
1721
1722                 if (uncached_rt) {
1723                         /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1724                          * No need for another dst_hold()
1725                          */
1726                         rt6_uncached_list_add(uncached_rt);
1727                         atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1728                 } else {
1729                         uncached_rt = net->ipv6.ip6_null_entry;
1730                         dst_hold(&uncached_rt->dst);
1731                 }
1732
1733 uncached_rt_out:
1734                 trace_fib6_table_lookup(net, uncached_rt, table, fl6);
1735                 return uncached_rt;
1736
1737         } else {
1738                 /* Get a percpu copy */
1739
1740                 struct rt6_info *pcpu_rt;
1741
1742                 dst_use_noref(&rt->dst, jiffies);
1743                 local_bh_disable();
1744                 pcpu_rt = rt6_get_pcpu_route(rt);
1745
1746                 if (!pcpu_rt) {
1747                         /* atomic_inc_not_zero() is needed when using rcu */
1748                         if (atomic_inc_not_zero(&rt->rt6i_ref)) {
1749                                 /* No dst_hold() on rt is needed because grabbing
1750                                  * rt->rt6i_ref makes sure rt can't be released.
1751                                  */
1752                                 pcpu_rt = rt6_make_pcpu_route(rt);
1753                                 rt6_release(rt);
1754                         } else {
1755                                 /* rt is already removed from tree */
1756                                 pcpu_rt = net->ipv6.ip6_null_entry;
1757                                 dst_hold(&pcpu_rt->dst);
1758                         }
1759                 }
1760                 local_bh_enable();
1761                 rcu_read_unlock();
1762                 trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
1763                 return pcpu_rt;
1764         }
1765 }
1766 EXPORT_SYMBOL_GPL(ip6_pol_route);
1767
1768 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1769                                             struct flowi6 *fl6, int flags)
1770 {
1771         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1772 }
1773
1774 struct dst_entry *ip6_route_input_lookup(struct net *net,
1775                                          struct net_device *dev,
1776                                          struct flowi6 *fl6, int flags)
1777 {
1778         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1779                 flags |= RT6_LOOKUP_F_IFACE;
1780
1781         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1782 }
1783 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1784
1785 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1786                                   struct flow_keys *keys)
1787 {
1788         const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1789         const struct ipv6hdr *key_iph = outer_iph;
1790         const struct ipv6hdr *inner_iph;
1791         const struct icmp6hdr *icmph;
1792         struct ipv6hdr _inner_iph;
1793
1794         if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1795                 goto out;
1796
1797         icmph = icmp6_hdr(skb);
1798         if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1799             icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1800             icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1801             icmph->icmp6_type != ICMPV6_PARAMPROB)
1802                 goto out;
1803
1804         inner_iph = skb_header_pointer(skb,
1805                                        skb_transport_offset(skb) + sizeof(*icmph),
1806                                        sizeof(_inner_iph), &_inner_iph);
1807         if (!inner_iph)
1808                 goto out;
1809
1810         key_iph = inner_iph;
1811 out:
1812         memset(keys, 0, sizeof(*keys));
1813         keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1814         keys->addrs.v6addrs.src = key_iph->saddr;
1815         keys->addrs.v6addrs.dst = key_iph->daddr;
1816         keys->tags.flow_label = ip6_flowinfo(key_iph);
1817         keys->basic.ip_proto = key_iph->nexthdr;
1818 }
1819
1820 /* if skb is set it will be used and fl6 can be NULL */
1821 u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
1822 {
1823         struct flow_keys hash_keys;
1824
1825         if (skb) {
1826                 ip6_multipath_l3_keys(skb, &hash_keys);
1827                 return flow_hash_from_keys(&hash_keys);
1828         }
1829
1830         return get_hash_from_flowi6(fl6);
1831 }
1832
1833 void ip6_route_input(struct sk_buff *skb)
1834 {
1835         const struct ipv6hdr *iph = ipv6_hdr(skb);
1836         struct net *net = dev_net(skb->dev);
1837         int flags = RT6_LOOKUP_F_HAS_SADDR;
1838         struct ip_tunnel_info *tun_info;
1839         struct flowi6 fl6 = {
1840                 .flowi6_iif = skb->dev->ifindex,
1841                 .daddr = iph->daddr,
1842                 .saddr = iph->saddr,
1843                 .flowlabel = ip6_flowinfo(iph),
1844                 .flowi6_mark = skb->mark,
1845                 .flowi6_proto = iph->nexthdr,
1846         };
1847
1848         tun_info = skb_tunnel_info(skb);
1849         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1850                 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1851         if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
1852                 fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
1853         skb_dst_drop(skb);
1854         skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1855 }
1856
1857 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1858                                              struct flowi6 *fl6, int flags)
1859 {
1860         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1861 }
1862
1863 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1864                                          struct flowi6 *fl6, int flags)
1865 {
1866         bool any_src;
1867
1868         if (rt6_need_strict(&fl6->daddr)) {
1869                 struct dst_entry *dst;
1870
1871                 dst = l3mdev_link_scope_lookup(net, fl6);
1872                 if (dst)
1873                         return dst;
1874         }
1875
1876         fl6->flowi6_iif = LOOPBACK_IFINDEX;
1877
1878         any_src = ipv6_addr_any(&fl6->saddr);
1879         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1880             (fl6->flowi6_oif && any_src))
1881                 flags |= RT6_LOOKUP_F_IFACE;
1882
1883         if (!any_src)
1884                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1885         else if (sk)
1886                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1887
1888         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1889 }
1890 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1891
1892 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1893 {
1894         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1895         struct net_device *loopback_dev = net->loopback_dev;
1896         struct dst_entry *new = NULL;
1897
1898         rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
1899                        DST_OBSOLETE_DEAD, 0);
1900         if (rt) {
1901                 rt6_info_init(rt);
1902                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
1903
1904                 new = &rt->dst;
1905                 new->__use = 1;
1906                 new->input = dst_discard;
1907                 new->output = dst_discard_out;
1908
1909                 dst_copy_metrics(new, &ort->dst);
1910
1911                 rt->rt6i_idev = in6_dev_get(loopback_dev);
1912                 rt->rt6i_gateway = ort->rt6i_gateway;
1913                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
1914                 rt->rt6i_metric = 0;
1915
1916                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1917 #ifdef CONFIG_IPV6_SUBTREES
1918                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1919 #endif
1920         }
1921
1922         dst_release(dst_orig);
1923         return new ? new : ERR_PTR(-ENOMEM);
1924 }
1925
1926 /*
1927  *      Destination cache support functions
1928  */
1929
1930 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1931 {
1932         if (rt->dst.from &&
1933             dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1934                 dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
1935 }
1936
1937 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1938 {
1939         u32 rt_cookie = 0;
1940
1941         if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
1942                 return NULL;
1943
1944         if (rt6_check_expired(rt))
1945                 return NULL;
1946
1947         return &rt->dst;
1948 }
1949
1950 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1951 {
1952         if (!__rt6_check_expired(rt) &&
1953             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1954             rt6_check((struct rt6_info *)(rt->dst.from), cookie))
1955                 return &rt->dst;
1956         else
1957                 return NULL;
1958 }
1959
1960 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1961 {
1962         struct rt6_info *rt;
1963
1964         rt = (struct rt6_info *) dst;
1965
1966         /* All IPV6 dsts are created with ->obsolete set to the value
1967          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1968          * into this function always.
1969          */
1970
1971         rt6_dst_from_metrics_check(rt);
1972
1973         if (rt->rt6i_flags & RTF_PCPU ||
1974             (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from))
1975                 return rt6_dst_from_check(rt, cookie);
1976         else
1977                 return rt6_check(rt, cookie);
1978 }
1979
1980 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1981 {
1982         struct rt6_info *rt = (struct rt6_info *) dst;
1983
1984         if (rt) {
1985                 if (rt->rt6i_flags & RTF_CACHE) {
1986                         if (rt6_check_expired(rt)) {
1987                                 ip6_del_rt(rt);
1988                                 dst = NULL;
1989                         }
1990                 } else {
1991                         dst_release(dst);
1992                         dst = NULL;
1993                 }
1994         }
1995         return dst;
1996 }
1997
1998 static void ip6_link_failure(struct sk_buff *skb)
1999 {
2000         struct rt6_info *rt;
2001
2002         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2003
2004         rt = (struct rt6_info *) skb_dst(skb);
2005         if (rt) {
2006                 if (rt->rt6i_flags & RTF_CACHE) {
2007                         if (dst_hold_safe(&rt->dst))
2008                                 ip6_del_rt(rt);
2009                 } else {
2010                         struct fib6_node *fn;
2011
2012                         rcu_read_lock();
2013                         fn = rcu_dereference(rt->rt6i_node);
2014                         if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2015                                 fn->fn_sernum = -1;
2016                         rcu_read_unlock();
2017                 }
2018         }
2019 }
2020
2021 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2022 {
2023         struct net *net = dev_net(rt->dst.dev);
2024
2025         rt->rt6i_flags |= RTF_MODIFIED;
2026         rt->rt6i_pmtu = mtu;
2027         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2028 }
2029
2030 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2031 {
2032         return !(rt->rt6i_flags & RTF_CACHE) &&
2033                 (rt->rt6i_flags & RTF_PCPU ||
2034                  rcu_access_pointer(rt->rt6i_node));
2035 }
2036
2037 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2038                                  const struct ipv6hdr *iph, u32 mtu)
2039 {
2040         const struct in6_addr *daddr, *saddr;
2041         struct rt6_info *rt6 = (struct rt6_info *)dst;
2042
2043         if (rt6->rt6i_flags & RTF_LOCAL)
2044                 return;
2045
2046         if (dst_metric_locked(dst, RTAX_MTU))
2047                 return;
2048
2049         if (iph) {
2050                 daddr = &iph->daddr;
2051                 saddr = &iph->saddr;
2052         } else if (sk) {
2053                 daddr = &sk->sk_v6_daddr;
2054                 saddr = &inet6_sk(sk)->saddr;
2055         } else {
2056                 daddr = NULL;
2057                 saddr = NULL;
2058         }
2059         dst_confirm_neigh(dst, daddr);
2060         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2061         if (mtu >= dst_mtu(dst))
2062                 return;
2063
2064         if (!rt6_cache_allowed_for_pmtu(rt6)) {
2065                 rt6_do_update_pmtu(rt6, mtu);
2066                 /* update rt6_ex->stamp for cache */
2067                 if (rt6->rt6i_flags & RTF_CACHE)
2068                         rt6_update_exception_stamp_rt(rt6);
2069         } else if (daddr) {
2070                 struct rt6_info *nrt6;
2071
2072                 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
2073                 if (nrt6) {
2074                         rt6_do_update_pmtu(nrt6, mtu);
2075                         if (rt6_insert_exception(nrt6, rt6))
2076                                 dst_release_immediate(&nrt6->dst);
2077                 }
2078         }
2079 }
2080
2081 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2082                                struct sk_buff *skb, u32 mtu)
2083 {
2084         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2085 }
2086
2087 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2088                      int oif, u32 mark, kuid_t uid)
2089 {
2090         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2091         struct dst_entry *dst;
2092         struct flowi6 fl6;
2093
2094         memset(&fl6, 0, sizeof(fl6));
2095         fl6.flowi6_oif = oif;
2096         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2097         fl6.daddr = iph->daddr;
2098         fl6.saddr = iph->saddr;
2099         fl6.flowlabel = ip6_flowinfo(iph);
2100         fl6.flowi6_uid = uid;
2101
2102         dst = ip6_route_output(net, NULL, &fl6);
2103         if (!dst->error)
2104                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2105         dst_release(dst);
2106 }
2107 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2108
2109 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2110 {
2111         struct dst_entry *dst;
2112
2113         ip6_update_pmtu(skb, sock_net(sk), mtu,
2114                         sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
2115
2116         dst = __sk_dst_get(sk);
2117         if (!dst || !dst->obsolete ||
2118             dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2119                 return;
2120
2121         bh_lock_sock(sk);
2122         if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2123                 ip6_datagram_dst_update(sk, false);
2124         bh_unlock_sock(sk);
2125 }
2126 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2127
2128 /* Handle redirects */
2129 struct ip6rd_flowi {
2130         struct flowi6 fl6;
2131         struct in6_addr gateway;
2132 };
2133
2134 static struct rt6_info *__ip6_route_redirect(struct net *net,
2135                                              struct fib6_table *table,
2136                                              struct flowi6 *fl6,
2137                                              int flags)
2138 {
2139         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2140         struct rt6_info *rt, *rt_cache;
2141         struct fib6_node *fn;
2142
2143         /* Get the "current" route for this destination and
2144          * check if the redirect has come from appropriate router.
2145          *
2146          * RFC 4861 specifies that redirects should only be
2147          * accepted if they come from the nexthop to the target.
2148          * Due to the way the routes are chosen, this notion
2149          * is a bit fuzzy and one might need to check all possible
2150          * routes.
2151          */
2152
2153         rcu_read_lock();
2154         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2155 restart:
2156         for_each_fib6_node_rt_rcu(fn) {
2157                 if (rt6_check_expired(rt))
2158                         continue;
2159                 if (rt->dst.error)
2160                         break;
2161                 if (!(rt->rt6i_flags & RTF_GATEWAY))
2162                         continue;
2163                 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
2164                         continue;
2165                 /* rt_cache's gateway might be different from its 'parent'
2166                  * in the case of an ip redirect.
2167                  * So we keep searching in the exception table if the gateway
2168                  * is different.
2169                  */
2170                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
2171                         rt_cache = rt6_find_cached_rt(rt,
2172                                                       &fl6->daddr,
2173                                                       &fl6->saddr);
2174                         if (rt_cache &&
2175                             ipv6_addr_equal(&rdfl->gateway,
2176                                             &rt_cache->rt6i_gateway)) {
2177                                 rt = rt_cache;
2178                                 break;
2179                         }
2180                         continue;
2181                 }
2182                 break;
2183         }
2184
2185         if (!rt)
2186                 rt = net->ipv6.ip6_null_entry;
2187         else if (rt->dst.error) {
2188                 rt = net->ipv6.ip6_null_entry;
2189                 goto out;
2190         }
2191
2192         if (rt == net->ipv6.ip6_null_entry) {
2193                 fn = fib6_backtrack(fn, &fl6->saddr);
2194                 if (fn)
2195                         goto restart;
2196         }
2197
2198 out:
2199         ip6_hold_safe(net, &rt, true);
2200
2201         rcu_read_unlock();
2202
2203         trace_fib6_table_lookup(net, rt, table, fl6);
2204         return rt;
2205 };
2206
2207 static struct dst_entry *ip6_route_redirect(struct net *net,
2208                                         const struct flowi6 *fl6,
2209                                         const struct in6_addr *gateway)
2210 {
2211         int flags = RT6_LOOKUP_F_HAS_SADDR;
2212         struct ip6rd_flowi rdfl;
2213
2214         rdfl.fl6 = *fl6;
2215         rdfl.gateway = *gateway;
2216
2217         return fib6_rule_lookup(net, &rdfl.fl6,
2218                                 flags, __ip6_route_redirect);
2219 }
2220
2221 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2222                   kuid_t uid)
2223 {
2224         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2225         struct dst_entry *dst;
2226         struct flowi6 fl6;
2227
2228         memset(&fl6, 0, sizeof(fl6));
2229         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2230         fl6.flowi6_oif = oif;
2231         fl6.flowi6_mark = mark;
2232         fl6.daddr = iph->daddr;
2233         fl6.saddr = iph->saddr;
2234         fl6.flowlabel = ip6_flowinfo(iph);
2235         fl6.flowi6_uid = uid;
2236
2237         dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
2238         rt6_do_redirect(dst, NULL, skb);
2239         dst_release(dst);
2240 }
2241 EXPORT_SYMBOL_GPL(ip6_redirect);
2242
2243 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2244                             u32 mark)
2245 {
2246         const struct ipv6hdr *iph = ipv6_hdr(skb);
2247         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2248         struct dst_entry *dst;
2249         struct flowi6 fl6;
2250
2251         memset(&fl6, 0, sizeof(fl6));
2252         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2253         fl6.flowi6_oif = oif;
2254         fl6.flowi6_mark = mark;
2255         fl6.daddr = msg->dest;
2256         fl6.saddr = iph->daddr;
2257         fl6.flowi6_uid = sock_net_uid(net, NULL);
2258
2259         dst = ip6_route_redirect(net, &fl6, &iph->saddr);
2260         rt6_do_redirect(dst, NULL, skb);
2261         dst_release(dst);
2262 }
2263
2264 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2265 {
2266         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2267                      sk->sk_uid);
2268 }
2269 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2270
2271 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2272 {
2273         struct net_device *dev = dst->dev;
2274         unsigned int mtu = dst_mtu(dst);
2275         struct net *net = dev_net(dev);
2276
2277         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2278
2279         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2280                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2281
2282         /*
2283          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2284          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2285          * IPV6_MAXPLEN is also valid and means: "any MSS,
2286          * rely only on pmtu discovery"
2287          */
2288         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2289                 mtu = IPV6_MAXPLEN;
2290         return mtu;
2291 }
2292
2293 static unsigned int ip6_mtu(const struct dst_entry *dst)
2294 {
2295         const struct rt6_info *rt = (const struct rt6_info *)dst;
2296         unsigned int mtu = rt->rt6i_pmtu;
2297         struct inet6_dev *idev;
2298
2299         if (mtu)
2300                 goto out;
2301
2302         mtu = dst_metric_raw(dst, RTAX_MTU);
2303         if (mtu)
2304                 goto out;
2305
2306         mtu = IPV6_MIN_MTU;
2307
2308         rcu_read_lock();
2309         idev = __in6_dev_get(dst->dev);
2310         if (idev)
2311                 mtu = idev->cnf.mtu6;
2312         rcu_read_unlock();
2313
2314 out:
2315         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2316
2317         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2318 }
2319
2320 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2321                                   struct flowi6 *fl6)
2322 {
2323         struct dst_entry *dst;
2324         struct rt6_info *rt;
2325         struct inet6_dev *idev = in6_dev_get(dev);
2326         struct net *net = dev_net(dev);
2327
2328         if (unlikely(!idev))
2329                 return ERR_PTR(-ENODEV);
2330
2331         rt = ip6_dst_alloc(net, dev, 0);
2332         if (unlikely(!rt)) {
2333                 in6_dev_put(idev);
2334                 dst = ERR_PTR(-ENOMEM);
2335                 goto out;
2336         }
2337
2338         rt->dst.flags |= DST_HOST;
2339         rt->dst.output  = ip6_output;
2340         rt->rt6i_gateway  = fl6->daddr;
2341         rt->rt6i_dst.addr = fl6->daddr;
2342         rt->rt6i_dst.plen = 128;
2343         rt->rt6i_idev     = idev;
2344         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2345
2346         /* Add this dst into uncached_list so that rt6_ifdown() can
2347          * do proper release of the net_device
2348          */
2349         rt6_uncached_list_add(rt);
2350         atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2351
2352         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2353
2354 out:
2355         return dst;
2356 }
2357
2358 static int ip6_dst_gc(struct dst_ops *ops)
2359 {
2360         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2361         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2362         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2363         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2364         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2365         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2366         int entries;
2367
2368         entries = dst_entries_get_fast(ops);
2369         if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2370             entries <= rt_max_size)
2371                 goto out;
2372
2373         net->ipv6.ip6_rt_gc_expire++;
2374         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2375         entries = dst_entries_get_slow(ops);
2376         if (entries < ops->gc_thresh)
2377                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2378 out:
2379         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2380         return entries > rt_max_size;
2381 }
2382
2383 static int ip6_convert_metrics(struct mx6_config *mxc,
2384                                const struct fib6_config *cfg)
2385 {
2386         struct net *net = cfg->fc_nlinfo.nl_net;
2387         bool ecn_ca = false;
2388         struct nlattr *nla;
2389         int remaining;
2390         u32 *mp;
2391
2392         if (!cfg->fc_mx)
2393                 return 0;
2394
2395         mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
2396         if (unlikely(!mp))
2397                 return -ENOMEM;
2398
2399         nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
2400                 int type = nla_type(nla);
2401                 u32 val;
2402
2403                 if (!type)
2404                         continue;
2405                 if (unlikely(type > RTAX_MAX))
2406                         goto err;
2407
2408                 if (type == RTAX_CC_ALGO) {
2409                         char tmp[TCP_CA_NAME_MAX];
2410
2411                         nla_strlcpy(tmp, nla, sizeof(tmp));
2412                         val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca);
2413                         if (val == TCP_CA_UNSPEC)
2414                                 goto err;
2415                 } else {
2416                         val = nla_get_u32(nla);
2417                 }
2418                 if (type == RTAX_HOPLIMIT && val > 255)
2419                         val = 255;
2420                 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
2421                         goto err;
2422
2423                 mp[type - 1] = val;
2424                 __set_bit(type - 1, mxc->mx_valid);
2425         }
2426
2427         if (ecn_ca) {
2428                 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
2429                 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
2430         }
2431
2432         mxc->mx = mp;
2433         return 0;
2434  err:
2435         kfree(mp);
2436         return -EINVAL;
2437 }
2438
2439 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2440                                             struct fib6_config *cfg,
2441                                             const struct in6_addr *gw_addr)
2442 {
2443         struct flowi6 fl6 = {
2444                 .flowi6_oif = cfg->fc_ifindex,
2445                 .daddr = *gw_addr,
2446                 .saddr = cfg->fc_prefsrc,
2447         };
2448         struct fib6_table *table;
2449         struct rt6_info *rt;
2450         int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE;
2451
2452         table = fib6_get_table(net, cfg->fc_table);
2453         if (!table)
2454                 return NULL;
2455
2456         if (!ipv6_addr_any(&cfg->fc_prefsrc))
2457                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2458
2459         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
2460
2461         /* if table lookup failed, fall back to full lookup */
2462         if (rt == net->ipv6.ip6_null_entry) {
2463                 ip6_rt_put(rt);
2464                 rt = NULL;
2465         }
2466
2467         return rt;
2468 }
2469
2470 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
2471                                               struct netlink_ext_ack *extack)
2472 {
2473         struct net *net = cfg->fc_nlinfo.nl_net;
2474         struct rt6_info *rt = NULL;
2475         struct net_device *dev = NULL;
2476         struct inet6_dev *idev = NULL;
2477         struct fib6_table *table;
2478         int addr_type;
2479         int err = -EINVAL;
2480
2481         /* RTF_PCPU is an internal flag; can not be set by userspace */
2482         if (cfg->fc_flags & RTF_PCPU) {
2483                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2484                 goto out;
2485         }
2486
2487         /* RTF_CACHE is an internal flag; can not be set by userspace */
2488         if (cfg->fc_flags & RTF_CACHE) {
2489                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2490                 goto out;
2491         }
2492
2493         if (cfg->fc_dst_len > 128) {
2494                 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2495                 goto out;
2496         }
2497         if (cfg->fc_src_len > 128) {
2498                 NL_SET_ERR_MSG(extack, "Invalid source address length");
2499                 goto out;
2500         }
2501 #ifndef CONFIG_IPV6_SUBTREES
2502         if (cfg->fc_src_len) {
2503                 NL_SET_ERR_MSG(extack,
2504                                "Specifying source address requires IPV6_SUBTREES to be enabled");
2505                 goto out;
2506         }
2507 #endif
2508         if (cfg->fc_ifindex) {
2509                 err = -ENODEV;
2510                 dev = dev_get_by_index(net, cfg->fc_ifindex);
2511                 if (!dev)
2512                         goto out;
2513                 idev = in6_dev_get(dev);
2514                 if (!idev)
2515                         goto out;
2516         }
2517
2518         if (cfg->fc_metric == 0)
2519                 cfg->fc_metric = IP6_RT_PRIO_USER;
2520
2521         err = -ENOBUFS;
2522         if (cfg->fc_nlinfo.nlh &&
2523             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2524                 table = fib6_get_table(net, cfg->fc_table);
2525                 if (!table) {
2526                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2527                         table = fib6_new_table(net, cfg->fc_table);
2528                 }
2529         } else {
2530                 table = fib6_new_table(net, cfg->fc_table);
2531         }
2532
2533         if (!table)
2534                 goto out;
2535
2536         rt = ip6_dst_alloc(net, NULL,
2537                            (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
2538
2539         if (!rt) {
2540                 err = -ENOMEM;
2541                 goto out;
2542         }
2543
2544         if (cfg->fc_flags & RTF_EXPIRES)
2545                 rt6_set_expires(rt, jiffies +
2546                                 clock_t_to_jiffies(cfg->fc_expires));
2547         else
2548                 rt6_clean_expires(rt);
2549
2550         if (cfg->fc_protocol == RTPROT_UNSPEC)
2551                 cfg->fc_protocol = RTPROT_BOOT;
2552         rt->rt6i_protocol = cfg->fc_protocol;
2553
2554         addr_type = ipv6_addr_type(&cfg->fc_dst);
2555
2556         if (addr_type & IPV6_ADDR_MULTICAST)
2557                 rt->dst.input = ip6_mc_input;
2558         else if (cfg->fc_flags & RTF_LOCAL)
2559                 rt->dst.input = ip6_input;
2560         else
2561                 rt->dst.input = ip6_forward;
2562
2563         rt->dst.output = ip6_output;
2564
2565         if (cfg->fc_encap) {
2566                 struct lwtunnel_state *lwtstate;
2567
2568                 err = lwtunnel_build_state(cfg->fc_encap_type,
2569                                            cfg->fc_encap, AF_INET6, cfg,
2570                                            &lwtstate, extack);
2571                 if (err)
2572                         goto out;
2573                 rt->dst.lwtstate = lwtstate_get(lwtstate);
2574                 if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
2575                         rt->dst.lwtstate->orig_output = rt->dst.output;
2576                         rt->dst.output = lwtunnel_output;
2577                 }
2578                 if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
2579                         rt->dst.lwtstate->orig_input = rt->dst.input;
2580                         rt->dst.input = lwtunnel_input;
2581                 }
2582         }
2583
2584         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
2585         rt->rt6i_dst.plen = cfg->fc_dst_len;
2586         if (rt->rt6i_dst.plen == 128)
2587                 rt->dst.flags |= DST_HOST;
2588
2589 #ifdef CONFIG_IPV6_SUBTREES
2590         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
2591         rt->rt6i_src.plen = cfg->fc_src_len;
2592 #endif
2593
2594         rt->rt6i_metric = cfg->fc_metric;
2595
2596         /* We cannot add true routes via loopback here,
2597            they would result in kernel looping; promote them to reject routes
2598          */
2599         if ((cfg->fc_flags & RTF_REJECT) ||
2600             (dev && (dev->flags & IFF_LOOPBACK) &&
2601              !(addr_type & IPV6_ADDR_LOOPBACK) &&
2602              !(cfg->fc_flags & RTF_LOCAL))) {
2603                 /* hold loopback dev/idev if we haven't done so. */
2604                 if (dev != net->loopback_dev) {
2605                         if (dev) {
2606                                 dev_put(dev);
2607                                 in6_dev_put(idev);
2608                         }
2609                         dev = net->loopback_dev;
2610                         dev_hold(dev);
2611                         idev = in6_dev_get(dev);
2612                         if (!idev) {
2613                                 err = -ENODEV;
2614                                 goto out;
2615                         }
2616                 }
2617                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
2618                 switch (cfg->fc_type) {
2619                 case RTN_BLACKHOLE:
2620                         rt->dst.error = -EINVAL;
2621                         rt->dst.output = dst_discard_out;
2622                         rt->dst.input = dst_discard;
2623                         break;
2624                 case RTN_PROHIBIT:
2625                         rt->dst.error = -EACCES;
2626                         rt->dst.output = ip6_pkt_prohibit_out;
2627                         rt->dst.input = ip6_pkt_prohibit;
2628                         break;
2629                 case RTN_THROW:
2630                 case RTN_UNREACHABLE:
2631                 default:
2632                         rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
2633                                         : (cfg->fc_type == RTN_UNREACHABLE)
2634                                         ? -EHOSTUNREACH : -ENETUNREACH;
2635                         rt->dst.output = ip6_pkt_discard_out;
2636                         rt->dst.input = ip6_pkt_discard;
2637                         break;
2638                 }
2639                 goto install_route;
2640         }
2641
2642         if (cfg->fc_flags & RTF_GATEWAY) {
2643                 const struct in6_addr *gw_addr;
2644                 int gwa_type;
2645
2646                 gw_addr = &cfg->fc_gateway;
2647                 gwa_type = ipv6_addr_type(gw_addr);
2648
2649                 /* if gw_addr is local we will fail to detect this in case
2650                  * address is still TENTATIVE (DAD in progress). rt6_lookup()
2651                  * will return already-added prefix route via interface that
2652                  * prefix route was assigned to, which might be non-loopback.
2653                  */
2654                 err = -EINVAL;
2655                 if (ipv6_chk_addr_and_flags(net, gw_addr,
2656                                             gwa_type & IPV6_ADDR_LINKLOCAL ?
2657                                             dev : NULL, 0, 0)) {
2658                         NL_SET_ERR_MSG(extack, "Invalid gateway address");
2659                         goto out;
2660                 }
2661                 rt->rt6i_gateway = *gw_addr;
2662
2663                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
2664                         struct rt6_info *grt = NULL;
2665
2666                         /* IPv6 strictly inhibits using not link-local
2667                            addresses as nexthop address.
2668                            Otherwise, router will not able to send redirects.
2669                            It is very good, but in some (rare!) circumstances
2670                            (SIT, PtP, NBMA NOARP links) it is handy to allow
2671                            some exceptions. --ANK
2672                            We allow IPv4-mapped nexthops to support RFC4798-type
2673                            addressing
2674                          */
2675                         if (!(gwa_type & (IPV6_ADDR_UNICAST |
2676                                           IPV6_ADDR_MAPPED))) {
2677                                 NL_SET_ERR_MSG(extack,
2678                                                "Invalid gateway address");
2679                                 goto out;
2680                         }
2681
2682                         if (cfg->fc_table) {
2683                                 grt = ip6_nh_lookup_table(net, cfg, gw_addr);
2684
2685                                 if (grt) {
2686                                         if (grt->rt6i_flags & RTF_GATEWAY ||
2687                                             (dev && dev != grt->dst.dev)) {
2688                                                 ip6_rt_put(grt);
2689                                                 grt = NULL;
2690                                         }
2691                                 }
2692                         }
2693
2694                         if (!grt)
2695                                 grt = rt6_lookup(net, gw_addr, NULL,
2696                                                  cfg->fc_ifindex, 1);
2697
2698                         err = -EHOSTUNREACH;
2699                         if (!grt)
2700                                 goto out;
2701                         if (dev) {
2702                                 if (dev != grt->dst.dev) {
2703                                         ip6_rt_put(grt);
2704                                         goto out;
2705                                 }
2706                         } else {
2707                                 dev = grt->dst.dev;
2708                                 idev = grt->rt6i_idev;
2709                                 dev_hold(dev);
2710                                 in6_dev_hold(grt->rt6i_idev);
2711                         }
2712                         if (!(grt->rt6i_flags & RTF_GATEWAY))
2713                                 err = 0;
2714                         ip6_rt_put(grt);
2715
2716                         if (err)
2717                                 goto out;
2718                 }
2719                 err = -EINVAL;
2720                 if (!dev) {
2721                         NL_SET_ERR_MSG(extack, "Egress device not specified");
2722                         goto out;
2723                 } else if (dev->flags & IFF_LOOPBACK) {
2724                         NL_SET_ERR_MSG(extack,
2725                                        "Egress device can not be loopback device for this route");
2726                         goto out;
2727                 }
2728         }
2729
2730         err = -ENODEV;
2731         if (!dev)
2732                 goto out;
2733
2734         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2735                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2736                         NL_SET_ERR_MSG(extack, "Invalid source address");
2737                         err = -EINVAL;
2738                         goto out;
2739                 }
2740                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2741                 rt->rt6i_prefsrc.plen = 128;
2742         } else
2743                 rt->rt6i_prefsrc.plen = 0;
2744
2745         rt->rt6i_flags = cfg->fc_flags;
2746
2747 install_route:
2748         rt->dst.dev = dev;
2749         rt->rt6i_idev = idev;
2750         rt->rt6i_table = table;
2751
2752         cfg->fc_nlinfo.nl_net = dev_net(dev);
2753
2754         return rt;
2755 out:
2756         if (dev)
2757                 dev_put(dev);
2758         if (idev)
2759                 in6_dev_put(idev);
2760         if (rt)
2761                 dst_release_immediate(&rt->dst);
2762
2763         return ERR_PTR(err);
2764 }
2765
2766 int ip6_route_add(struct fib6_config *cfg,
2767                   struct netlink_ext_ack *extack)
2768 {
2769         struct mx6_config mxc = { .mx = NULL, };
2770         struct rt6_info *rt;
2771         int err;
2772
2773         rt = ip6_route_info_create(cfg, extack);
2774         if (IS_ERR(rt)) {
2775                 err = PTR_ERR(rt);
2776                 rt = NULL;
2777                 goto out;
2778         }
2779
2780         err = ip6_convert_metrics(&mxc, cfg);
2781         if (err)
2782                 goto out;
2783
2784         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
2785
2786         kfree(mxc.mx);
2787
2788         return err;
2789 out:
2790         if (rt)
2791                 dst_release_immediate(&rt->dst);
2792
2793         return err;
2794 }
2795
2796 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2797 {
2798         int err;
2799         struct fib6_table *table;
2800         struct net *net = dev_net(rt->dst.dev);
2801
2802         if (rt == net->ipv6.ip6_null_entry) {
2803                 err = -ENOENT;
2804                 goto out;
2805         }
2806
2807         table = rt->rt6i_table;
2808         spin_lock_bh(&table->tb6_lock);
2809         err = fib6_del(rt, info);
2810         spin_unlock_bh(&table->tb6_lock);
2811
2812 out:
2813         ip6_rt_put(rt);
2814         return err;
2815 }
2816
2817 int ip6_del_rt(struct rt6_info *rt)
2818 {
2819         struct nl_info info = {
2820                 .nl_net = dev_net(rt->dst.dev),
2821         };
2822         return __ip6_del_rt(rt, &info);
2823 }
2824
2825 static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
2826 {
2827         struct nl_info *info = &cfg->fc_nlinfo;
2828         struct net *net = info->nl_net;
2829         struct sk_buff *skb = NULL;
2830         struct fib6_table *table;
2831         int err = -ENOENT;
2832
2833         if (rt == net->ipv6.ip6_null_entry)
2834                 goto out_put;
2835         table = rt->rt6i_table;
2836         spin_lock_bh(&table->tb6_lock);
2837
2838         if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
2839                 struct rt6_info *sibling, *next_sibling;
2840
2841                 /* prefer to send a single notification with all hops */
2842                 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
2843                 if (skb) {
2844                         u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2845
2846                         if (rt6_fill_node(net, skb, rt,
2847                                           NULL, NULL, 0, RTM_DELROUTE,
2848                                           info->portid, seq, 0) < 0) {
2849                                 kfree_skb(skb);
2850                                 skb = NULL;
2851                         } else
2852                                 info->skip_notify = 1;
2853                 }
2854
2855                 list_for_each_entry_safe(sibling, next_sibling,
2856                                          &rt->rt6i_siblings,
2857                                          rt6i_siblings) {
2858                         err = fib6_del(sibling, info);
2859                         if (err)
2860                                 goto out_unlock;
2861                 }
2862         }
2863
2864         err = fib6_del(rt, info);
2865 out_unlock:
2866         spin_unlock_bh(&table->tb6_lock);
2867 out_put:
2868         ip6_rt_put(rt);
2869
2870         if (skb) {
2871                 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
2872                             info->nlh, gfp_any());
2873         }
2874         return err;
2875 }
2876
2877 static int ip6_route_del(struct fib6_config *cfg,
2878                          struct netlink_ext_ack *extack)
2879 {
2880         struct rt6_info *rt, *rt_cache;
2881         struct fib6_table *table;
2882         struct fib6_node *fn;
2883         int err = -ESRCH;
2884
2885         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
2886         if (!table) {
2887                 NL_SET_ERR_MSG(extack, "FIB table does not exist");
2888                 return err;
2889         }
2890
2891         rcu_read_lock();
2892
2893         fn = fib6_locate(&table->tb6_root,
2894                          &cfg->fc_dst, cfg->fc_dst_len,
2895                          &cfg->fc_src, cfg->fc_src_len,
2896                          !(cfg->fc_flags & RTF_CACHE));
2897
2898         if (fn) {
2899                 for_each_fib6_node_rt_rcu(fn) {
2900                         if (cfg->fc_flags & RTF_CACHE) {
2901                                 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
2902                                                               &cfg->fc_src);
2903                                 if (!rt_cache)
2904                                         continue;
2905                                 rt = rt_cache;
2906                         }
2907                         if (cfg->fc_ifindex &&
2908                             (!rt->dst.dev ||
2909                              rt->dst.dev->ifindex != cfg->fc_ifindex))
2910                                 continue;
2911                         if (cfg->fc_flags & RTF_GATEWAY &&
2912                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
2913                                 continue;
2914                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
2915                                 continue;
2916                         if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
2917                                 continue;
2918                         if (!dst_hold_safe(&rt->dst))
2919                                 break;
2920                         rcu_read_unlock();
2921
2922                         /* if gateway was specified only delete the one hop */
2923                         if (cfg->fc_flags & RTF_GATEWAY)
2924                                 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
2925
2926                         return __ip6_del_rt_siblings(rt, cfg);
2927                 }
2928         }
2929         rcu_read_unlock();
2930
2931         return err;
2932 }
2933
2934 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
2935 {
2936         struct netevent_redirect netevent;
2937         struct rt6_info *rt, *nrt = NULL;
2938         struct ndisc_options ndopts;
2939         struct inet6_dev *in6_dev;
2940         struct neighbour *neigh;
2941         struct rd_msg *msg;
2942         int optlen, on_link;
2943         u8 *lladdr;
2944
2945         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
2946         optlen -= sizeof(*msg);
2947
2948         if (optlen < 0) {
2949                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
2950                 return;
2951         }
2952
2953         msg = (struct rd_msg *)icmp6_hdr(skb);
2954
2955         if (ipv6_addr_is_multicast(&msg->dest)) {
2956                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
2957                 return;
2958         }
2959
2960         on_link = 0;
2961         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
2962                 on_link = 1;
2963         } else if (ipv6_addr_type(&msg->target) !=
2964                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
2965                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
2966                 return;
2967         }
2968
2969         in6_dev = __in6_dev_get(skb->dev);
2970         if (!in6_dev)
2971                 return;
2972         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
2973                 return;
2974
2975         /* RFC2461 8.1:
2976          *      The IP source address of the Redirect MUST be the same as the current
2977          *      first-hop router for the specified ICMP Destination Address.
2978          */
2979
2980         if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
2981                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
2982                 return;
2983         }
2984
2985         lladdr = NULL;
2986         if (ndopts.nd_opts_tgt_lladdr) {
2987                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
2988                                              skb->dev);
2989                 if (!lladdr) {
2990                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
2991                         return;
2992                 }
2993         }
2994
2995         rt = (struct rt6_info *) dst;
2996         if (rt->rt6i_flags & RTF_REJECT) {
2997                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
2998                 return;
2999         }
3000
3001         /* Redirect received -> path was valid.
3002          * Look, redirects are sent only in response to data packets,
3003          * so that this nexthop apparently is reachable. --ANK
3004          */
3005         dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3006
3007         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3008         if (!neigh)
3009                 return;
3010
3011         /*
3012          *      We have finally decided to accept it.
3013          */
3014
3015         ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3016                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
3017                      NEIGH_UPDATE_F_OVERRIDE|
3018                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3019                                      NEIGH_UPDATE_F_ISROUTER)),
3020                      NDISC_REDIRECT, &ndopts);
3021
3022         nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
3023         if (!nrt)
3024                 goto out;
3025
3026         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3027         if (on_link)
3028                 nrt->rt6i_flags &= ~RTF_GATEWAY;
3029
3030         nrt->rt6i_protocol = RTPROT_REDIRECT;
3031         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3032
3033         /* No need to remove rt from the exception table if rt is
3034          * a cached route because rt6_insert_exception() will
3035          * takes care of it
3036          */
3037         if (rt6_insert_exception(nrt, rt)) {
3038                 dst_release_immediate(&nrt->dst);
3039                 goto out;
3040         }
3041
3042         netevent.old = &rt->dst;
3043         netevent.new = &nrt->dst;
3044         netevent.daddr = &msg->dest;
3045         netevent.neigh = neigh;
3046         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3047
3048 out:
3049         neigh_release(neigh);
3050 }
3051
3052 /*
3053  *      Misc support functions
3054  */
3055
3056 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
3057 {
3058         BUG_ON(from->dst.from);
3059
3060         rt->rt6i_flags &= ~RTF_EXPIRES;
3061         dst_hold(&from->dst);
3062         rt->dst.from = &from->dst;
3063         dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
3064 }
3065
3066 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
3067 {
3068         rt->dst.input = ort->dst.input;
3069         rt->dst.output = ort->dst.output;
3070         rt->rt6i_dst = ort->rt6i_dst;
3071         rt->dst.error = ort->dst.error;
3072         rt->rt6i_idev = ort->rt6i_idev;
3073         if (rt->rt6i_idev)
3074                 in6_dev_hold(rt->rt6i_idev);
3075         rt->dst.lastuse = jiffies;
3076         rt->rt6i_gateway = ort->rt6i_gateway;
3077         rt->rt6i_flags = ort->rt6i_flags;
3078         rt6_set_from(rt, ort);
3079         rt->rt6i_metric = ort->rt6i_metric;
3080 #ifdef CONFIG_IPV6_SUBTREES
3081         rt->rt6i_src = ort->rt6i_src;
3082 #endif
3083         rt->rt6i_prefsrc = ort->rt6i_prefsrc;
3084         rt->rt6i_table = ort->rt6i_table;
3085         rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
3086 }
3087
3088 #ifdef CONFIG_IPV6_ROUTE_INFO
3089 static struct rt6_info *rt6_get_route_info(struct net *net,
3090                                            const struct in6_addr *prefix, int prefixlen,
3091                                            const struct in6_addr *gwaddr,
3092                                            struct net_device *dev)
3093 {
3094         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3095         int ifindex = dev->ifindex;
3096         struct fib6_node *fn;
3097         struct rt6_info *rt = NULL;
3098         struct fib6_table *table;
3099
3100         table = fib6_get_table(net, tb_id);
3101         if (!table)
3102                 return NULL;
3103
3104         rcu_read_lock();
3105         fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3106         if (!fn)
3107                 goto out;
3108
3109         for_each_fib6_node_rt_rcu(fn) {
3110                 if (rt->dst.dev->ifindex != ifindex)
3111                         continue;
3112                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3113                         continue;
3114                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
3115                         continue;
3116                 ip6_hold_safe(NULL, &rt, false);
3117                 break;
3118         }
3119 out:
3120         rcu_read_unlock();
3121         return rt;
3122 }
3123
3124 static struct rt6_info *rt6_add_route_info(struct net *net,
3125                                            const struct in6_addr *prefix, int prefixlen,
3126                                            const struct in6_addr *gwaddr,
3127                                            struct net_device *dev,
3128                                            unsigned int pref)
3129 {
3130         struct fib6_config cfg = {
3131                 .fc_metric      = IP6_RT_PRIO_USER,
3132                 .fc_ifindex     = dev->ifindex,
3133                 .fc_dst_len     = prefixlen,
3134                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3135                                   RTF_UP | RTF_PREF(pref),
3136                 .fc_protocol = RTPROT_RA,
3137                 .fc_nlinfo.portid = 0,
3138                 .fc_nlinfo.nlh = NULL,
3139                 .fc_nlinfo.nl_net = net,
3140         };
3141
3142         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3143         cfg.fc_dst = *prefix;
3144         cfg.fc_gateway = *gwaddr;
3145
3146         /* We should treat it as a default route if prefix length is 0. */
3147         if (!prefixlen)
3148                 cfg.fc_flags |= RTF_DEFAULT;
3149
3150         ip6_route_add(&cfg, NULL);
3151
3152         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3153 }
3154 #endif
3155
3156 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
3157 {
3158         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3159         struct rt6_info *rt;
3160         struct fib6_table *table;
3161
3162         table = fib6_get_table(dev_net(dev), tb_id);
3163         if (!table)
3164                 return NULL;
3165
3166         rcu_read_lock();
3167         for_each_fib6_node_rt_rcu(&table->tb6_root) {
3168                 if (dev == rt->dst.dev &&
3169                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3170                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
3171                         break;
3172         }
3173         if (rt)
3174                 ip6_hold_safe(NULL, &rt, false);
3175         rcu_read_unlock();
3176         return rt;
3177 }
3178
3179 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
3180                                      struct net_device *dev,
3181                                      unsigned int pref)
3182 {
3183         struct fib6_config cfg = {
3184                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3185                 .fc_metric      = IP6_RT_PRIO_USER,
3186                 .fc_ifindex     = dev->ifindex,
3187                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3188                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3189                 .fc_protocol = RTPROT_RA,
3190                 .fc_nlinfo.portid = 0,
3191                 .fc_nlinfo.nlh = NULL,
3192                 .fc_nlinfo.nl_net = dev_net(dev),
3193         };
3194
3195         cfg.fc_gateway = *gwaddr;
3196
3197         if (!ip6_route_add(&cfg, NULL)) {
3198                 struct fib6_table *table;
3199
3200                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3201                 if (table)
3202                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3203         }
3204
3205         return rt6_get_dflt_router(gwaddr, dev);
3206 }
3207
3208 static void __rt6_purge_dflt_routers(struct fib6_table *table)
3209 {
3210         struct rt6_info *rt;
3211
3212 restart:
3213         rcu_read_lock();
3214         for_each_fib6_node_rt_rcu(&table->tb6_root) {
3215                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3216                     (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
3217                         if (dst_hold_safe(&rt->dst)) {
3218                                 rcu_read_unlock();
3219                                 ip6_del_rt(rt);
3220                         } else {
3221                                 rcu_read_unlock();
3222                         }
3223                         goto restart;
3224                 }
3225         }
3226         rcu_read_unlock();
3227
3228         table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3229 }
3230
3231 void rt6_purge_dflt_routers(struct net *net)
3232 {
3233         struct fib6_table *table;
3234         struct hlist_head *head;
3235         unsigned int h;
3236
3237         rcu_read_lock();
3238
3239         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3240                 head = &net->ipv6.fib_table_hash[h];
3241                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3242                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3243                                 __rt6_purge_dflt_routers(table);
3244                 }
3245         }
3246
3247         rcu_read_unlock();
3248 }
3249
3250 static void rtmsg_to_fib6_config(struct net *net,
3251                                  struct in6_rtmsg *rtmsg,
3252                                  struct fib6_config *cfg)
3253 {
3254         memset(cfg, 0, sizeof(*cfg));
3255
3256         cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3257                          : RT6_TABLE_MAIN;
3258         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3259         cfg->fc_metric = rtmsg->rtmsg_metric;
3260         cfg->fc_expires = rtmsg->rtmsg_info;
3261         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3262         cfg->fc_src_len = rtmsg->rtmsg_src_len;
3263         cfg->fc_flags = rtmsg->rtmsg_flags;
3264
3265         cfg->fc_nlinfo.nl_net = net;
3266
3267         cfg->fc_dst = rtmsg->rtmsg_dst;
3268         cfg->fc_src = rtmsg->rtmsg_src;
3269         cfg->fc_gateway = rtmsg->rtmsg_gateway;
3270 }
3271
3272 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3273 {
3274         struct fib6_config cfg;
3275         struct in6_rtmsg rtmsg;
3276         int err;
3277
3278         switch (cmd) {
3279         case SIOCADDRT:         /* Add a route */
3280         case SIOCDELRT:         /* Delete a route */
3281                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3282                         return -EPERM;
3283                 err = copy_from_user(&rtmsg, arg,
3284                                      sizeof(struct in6_rtmsg));
3285                 if (err)
3286                         return -EFAULT;
3287
3288                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3289
3290                 rtnl_lock();
3291                 switch (cmd) {
3292                 case SIOCADDRT:
3293                         err = ip6_route_add(&cfg, NULL);
3294                         break;
3295                 case SIOCDELRT:
3296                         err = ip6_route_del(&cfg, NULL);
3297                         break;
3298                 default:
3299                         err = -EINVAL;
3300                 }
3301                 rtnl_unlock();
3302
3303                 return err;
3304         }
3305
3306         return -EINVAL;
3307 }
3308
3309 /*
3310  *      Drop the packet on the floor
3311  */
3312
3313 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3314 {
3315         int type;
3316         struct dst_entry *dst = skb_dst(skb);
3317         switch (ipstats_mib_noroutes) {
3318         case IPSTATS_MIB_INNOROUTES:
3319                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3320                 if (type == IPV6_ADDR_ANY) {
3321                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3322                                       IPSTATS_MIB_INADDRERRORS);
3323                         break;
3324                 }
3325                 /* FALLTHROUGH */
3326         case IPSTATS_MIB_OUTNOROUTES:
3327                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3328                               ipstats_mib_noroutes);
3329                 break;
3330         }
3331         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3332         kfree_skb(skb);
3333         return 0;
3334 }
3335
/* Drop @skb via ip6_pkt_drop(): the ICMPv6 error code is ICMPV6_NOROUTE and
 * the drop is accounted as IPSTATS_MIB_INNOROUTES.  Returns what
 * ip6_pkt_drop() returns.
 */
static int ip6_pkt_discard(struct sk_buff *skb)
{
        return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
3340
3341 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3342 {
3343         skb->dev = skb_dst(skb)->dev;
3344         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3345 }
3346
/* Drop @skb via ip6_pkt_drop(): the ICMPv6 error code is
 * ICMPV6_ADM_PROHIBITED and the drop is accounted as IPSTATS_MIB_INNOROUTES.
 * Returns what ip6_pkt_drop() returns.
 */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
        return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}
3351
3352 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3353 {
3354         skb->dev = skb_dst(skb)->dev;
3355         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3356 }
3357
3358 /*
3359  *      Allocate a dst for local (unicast / anycast) address.
3360  */
3361
3362 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
3363                                     const struct in6_addr *addr,
3364                                     bool anycast)
3365 {
3366         u32 tb_id;
3367         struct net *net = dev_net(idev->dev);
3368         struct net_device *dev = idev->dev;
3369         struct rt6_info *rt;
3370
3371         rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
3372         if (!rt)
3373                 return ERR_PTR(-ENOMEM);
3374
3375         in6_dev_hold(idev);
3376
3377         rt->dst.flags |= DST_HOST;
3378         rt->dst.input = ip6_input;
3379         rt->dst.output = ip6_output;
3380         rt->rt6i_idev = idev;
3381
3382         rt->rt6i_protocol = RTPROT_KERNEL;
3383         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
3384         if (anycast)
3385                 rt->rt6i_flags |= RTF_ANYCAST;
3386         else
3387                 rt->rt6i_flags |= RTF_LOCAL;
3388
3389         rt->rt6i_gateway  = *addr;
3390         rt->rt6i_dst.addr = *addr;
3391         rt->rt6i_dst.plen = 128;
3392         tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3393         rt->rt6i_table = fib6_get_table(net, tb_id);
3394
3395         return rt;
3396 }
3397
/* remove deleted ip from prefsrc entries */
/* Argument bundle handed to fib6_remove_prefsrc() through fib6_clean_all(). */
struct arg_dev_net_ip {
        struct net_device *dev; /* device to match; NULL matches every device */
        struct net *net;        /* namespace whose ip6_null_entry is skipped */
        struct in6_addr *addr;  /* address compared with each route's prefsrc */
};
3404
3405 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
3406 {
3407         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3408         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3409         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3410
3411         if (((void *)rt->dst.dev == dev || !dev) &&
3412             rt != net->ipv6.ip6_null_entry &&
3413             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
3414                 spin_lock_bh(&rt6_exception_lock);
3415                 /* remove prefsrc entry */
3416                 rt->rt6i_prefsrc.plen = 0;
3417                 /* need to update cache as well */
3418                 rt6_exceptions_remove_prefsrc(rt);
3419                 spin_unlock_bh(&rt6_exception_lock);
3420         }
3421         return 0;
3422 }
3423
3424 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3425 {
3426         struct net *net = dev_net(ifp->idev->dev);
3427         struct arg_dev_net_ip adni = {
3428                 .dev = ifp->idev->dev,
3429                 .net = net,
3430                 .addr = &ifp->addr,
3431         };
3432         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3433 }
3434
3435 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3436
3437 /* Remove routers and update dst entries when gateway turn into host. */
3438 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
3439 {
3440         struct in6_addr *gateway = (struct in6_addr *)arg;
3441
3442         if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3443             ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
3444                 return -1;
3445         }
3446
3447         /* Further clean up cached routes in exception table.
3448          * This is needed because cached route may have a different
3449          * gateway than its 'parent' in the case of an ip redirect.
3450          */
3451         rt6_exceptions_clean_tohost(rt, gateway);
3452
3453         return 0;
3454 }
3455
/* Apply fib6_clean_tohost() to every route in @net, passing @gateway as the
 * callback argument.
 */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
        fib6_clean_all(net, fib6_clean_tohost, gateway);
}
3460
/* Argument bundle handed to fib6_ifdown() through fib6_clean_all(). */
struct arg_dev_net {
        struct net_device *dev; /* device to match; NULL matches every device */
        struct net *net;        /* namespace whose ip6_null_entry is skipped */
};
3465
3466 /* called with write lock held for table with rt */
3467 static int fib6_ifdown(struct rt6_info *rt, void *arg)
3468 {
3469         const struct arg_dev_net *adn = arg;
3470         const struct net_device *dev = adn->dev;
3471
3472         if ((rt->dst.dev == dev || !dev) &&
3473             rt != adn->net->ipv6.ip6_null_entry &&
3474             (rt->rt6i_nsiblings == 0 ||
3475              (dev && netdev_unregistering(dev)) ||
3476              !rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
3477                 return -1;
3478
3479         return 0;
3480 }
3481
3482 void rt6_ifdown(struct net *net, struct net_device *dev)
3483 {
3484         struct arg_dev_net adn = {
3485                 .dev = dev,
3486                 .net = net,
3487         };
3488
3489         fib6_clean_all(net, fib6_ifdown, &adn);
3490         if (dev)
3491                 rt6_uncached_list_flush_dev(net, dev);
3492 }
3493
/* Argument bundle handed to rt6_mtu_change_route() through fib6_clean_all(). */
struct rt6_mtu_change_arg {
        struct net_device *dev; /* only routes on this device are touched */
        unsigned int mtu;       /* value written to the matching routes */
};
3498
3499 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
3500 {
3501         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
3502         struct inet6_dev *idev;
3503
3504         /* In IPv6 pmtu discovery is not optional,
3505            so that RTAX_MTU lock cannot disable it.
3506            We still use this lock to block changes
3507            caused by addrconf/ndisc.
3508         */
3509
3510         idev = __in6_dev_get(arg->dev);
3511         if (!idev)
3512                 return 0;
3513
3514         /* For administrative MTU increase, there is no way to discover
3515            IPv6 PMTU increase, so PMTU increase should be updated here.
3516            Since RFC 1981 doesn't include administrative MTU increase
3517            update PMTU increase is a MUST. (i.e. jumbo frame)
3518          */
3519         /*
3520            If new MTU is less than route PMTU, this new MTU will be the
3521            lowest MTU in the path, update the route PMTU to reflect PMTU
3522            decreases; if new MTU is greater than route PMTU, and the
3523            old MTU is the lowest MTU in the path, update the route PMTU
3524            to reflect the increase. In this case if the other nodes' MTU
3525            also have the lowest MTU, TOO BIG MESSAGE will be lead to
3526            PMTU discovery.
3527          */
3528         if (rt->dst.dev == arg->dev &&
3529             dst_metric_raw(&rt->dst, RTAX_MTU) &&
3530             !dst_metric_locked(&rt->dst, RTAX_MTU)) {
3531                 spin_lock_bh(&rt6_exception_lock);
3532                 if (dst_mtu(&rt->dst) >= arg->mtu ||
3533                     (dst_mtu(&rt->dst) < arg->mtu &&
3534                      dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
3535                         dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
3536                 }
3537                 rt6_exceptions_update_pmtu(rt, arg->mtu);
3538                 spin_unlock_bh(&rt6_exception_lock);
3539         }
3540         return 0;
3541 }
3542
3543 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
3544 {
3545         struct rt6_mtu_change_arg arg = {
3546                 .dev = dev,
3547                 .mtu = mtu,
3548         };
3549
3550         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
3551 }
3552
3553 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
3554         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
3555         [RTA_OIF]               = { .type = NLA_U32 },
3556         [RTA_IIF]               = { .type = NLA_U32 },
3557         [RTA_PRIORITY]          = { .type = NLA_U32 },
3558         [RTA_METRICS]           = { .type = NLA_NESTED },
3559         [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
3560         [RTA_PREF]              = { .type = NLA_U8 },
3561         [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
3562         [RTA_ENCAP]             = { .type = NLA_NESTED },
3563         [RTA_EXPIRES]           = { .type = NLA_U32 },
3564         [RTA_UID]               = { .type = NLA_U32 },
3565         [RTA_MARK]              = { .type = NLA_U32 },
3566 };
3567
3568 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
3569                               struct fib6_config *cfg,
3570                               struct netlink_ext_ack *extack)
3571 {
3572         struct rtmsg *rtm;
3573         struct nlattr *tb[RTA_MAX+1];
3574         unsigned int pref;
3575         int err;
3576
3577         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
3578                           NULL);
3579         if (err < 0)
3580                 goto errout;
3581
3582         err = -EINVAL;
3583         rtm = nlmsg_data(nlh);
3584         memset(cfg, 0, sizeof(*cfg));
3585
3586         cfg->fc_table = rtm->rtm_table;
3587         cfg->fc_dst_len = rtm->rtm_dst_len;
3588         cfg->fc_src_len = rtm->rtm_src_len;
3589         cfg->fc_flags = RTF_UP;
3590         cfg->fc_protocol = rtm->rtm_protocol;
3591         cfg->fc_type = rtm->rtm_type;
3592
3593         if (rtm->rtm_type == RTN_UNREACHABLE ||
3594             rtm->rtm_type == RTN_BLACKHOLE ||
3595             rtm->rtm_type == RTN_PROHIBIT ||
3596             rtm->rtm_type == RTN_THROW)
3597                 cfg->fc_flags |= RTF_REJECT;
3598
3599         if (rtm->rtm_type == RTN_LOCAL)
3600                 cfg->fc_flags |= RTF_LOCAL;
3601
3602         if (rtm->rtm_flags & RTM_F_CLONED)
3603                 cfg->fc_flags |= RTF_CACHE;
3604
3605         cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
3606         cfg->fc_nlinfo.nlh = nlh;
3607         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
3608
3609         if (tb[RTA_GATEWAY]) {
3610                 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
3611                 cfg->fc_flags |= RTF_GATEWAY;
3612         }
3613
3614         if (tb[RTA_DST]) {
3615                 int plen = (rtm->rtm_dst_len + 7) >> 3;
3616
3617                 if (nla_len(tb[RTA_DST]) < plen)
3618                         goto errout;
3619
3620                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
3621         }
3622
3623         if (tb[RTA_SRC]) {
3624                 int plen = (rtm->rtm_src_len + 7) >> 3;
3625
3626                 if (nla_len(tb[RTA_SRC]) < plen)
3627                         goto errout;
3628
3629                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
3630         }
3631
3632         if (tb[RTA_PREFSRC])
3633                 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
3634
3635         if (tb[RTA_OIF])
3636                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
3637
3638         if (tb[RTA_PRIORITY])
3639                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
3640
3641         if (tb[RTA_METRICS]) {
3642                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
3643                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
3644         }
3645
3646         if (tb[RTA_TABLE])
3647                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
3648
3649         if (tb[RTA_MULTIPATH]) {
3650                 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
3651                 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
3652
3653                 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
3654                                                      cfg->fc_mp_len, extack);
3655                 if (err < 0)
3656                         goto errout;
3657         }
3658
3659         if (tb[RTA_PREF]) {
3660                 pref = nla_get_u8(tb[RTA_PREF]);
3661                 if (pref != ICMPV6_ROUTER_PREF_LOW &&
3662                     pref != ICMPV6_ROUTER_PREF_HIGH)
3663                         pref = ICMPV6_ROUTER_PREF_MEDIUM;
3664                 cfg->fc_flags |= RTF_PREF(pref);
3665         }
3666
3667         if (tb[RTA_ENCAP])
3668                 cfg->fc_encap = tb[RTA_ENCAP];
3669
3670         if (tb[RTA_ENCAP_TYPE]) {
3671                 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
3672
3673                 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
3674                 if (err < 0)
3675                         goto errout;
3676         }
3677
3678         if (tb[RTA_EXPIRES]) {
3679                 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
3680
3681                 if (addrconf_finite_timeout(timeout)) {
3682                         cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
3683                         cfg->fc_flags |= RTF_EXPIRES;
3684                 }
3685         }
3686
3687         err = 0;
3688 errout:
3689         return err;
3690 }
3691
/* One nexthop of a multipath request while ip6_route_multipath_add() builds
 * and inserts the routes.
 */
struct rt6_nh {
        struct rt6_info *rt6_info;      /* route for this nexthop; reset to NULL once insertion was attempted */
        struct fib6_config r_cfg;       /* per-nexthop copy of the request config */
        struct mx6_config mxc;          /* metrics from ip6_convert_metrics(); mxc.mx is kfree()d on cleanup */
        struct list_head next;          /* link in the caller's rt6_nh_list */
};
3698
3699 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
3700 {
3701         struct rt6_nh *nh;
3702
3703         list_for_each_entry(nh, rt6_nh_list, next) {
3704                 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
3705                         &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
3706                         nh->r_cfg.fc_ifindex);
3707         }
3708 }
3709
3710 static int ip6_route_info_append(struct list_head *rt6_nh_list,
3711                                  struct rt6_info *rt, struct fib6_config *r_cfg)
3712 {
3713         struct rt6_nh *nh;
3714         int err = -EEXIST;
3715
3716         list_for_each_entry(nh, rt6_nh_list, next) {
3717                 /* check if rt6_info already exists */
3718                 if (rt6_duplicate_nexthop(nh->rt6_info, rt))
3719                         return err;
3720         }
3721
3722         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
3723         if (!nh)
3724                 return -ENOMEM;
3725         nh->rt6_info = rt;
3726         err = ip6_convert_metrics(&nh->mxc, r_cfg);
3727         if (err) {
3728                 kfree(nh);
3729                 return err;
3730         }
3731         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
3732         list_add_tail(&nh->next, rt6_nh_list);
3733
3734         return 0;
3735 }
3736
3737 static void ip6_route_mpath_notify(struct rt6_info *rt,
3738                                    struct rt6_info *rt_last,
3739                                    struct nl_info *info,
3740                                    __u16 nlflags)
3741 {
3742         /* if this is an APPEND route, then rt points to the first route
3743          * inserted and rt_last points to last route inserted. Userspace
3744          * wants a consistent dump of the route which starts at the first
3745          * nexthop. Since sibling routes are always added at the end of
3746          * the list, find the first sibling of the last route appended
3747          */
3748         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
3749                 rt = list_first_entry(&rt_last->rt6i_siblings,
3750                                       struct rt6_info,
3751                                       rt6i_siblings);
3752         }
3753
3754         if (rt)
3755                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
3756 }
3757
3758 static int ip6_route_multipath_add(struct fib6_config *cfg,
3759                                    struct netlink_ext_ack *extack)
3760 {
3761         struct rt6_info *rt_notif = NULL, *rt_last = NULL;
3762         struct nl_info *info = &cfg->fc_nlinfo;
3763         struct fib6_config r_cfg;
3764         struct rtnexthop *rtnh;
3765         struct rt6_info *rt;
3766         struct rt6_nh *err_nh;
3767         struct rt6_nh *nh, *nh_safe;
3768         __u16 nlflags;
3769         int remaining;
3770         int attrlen;
3771         int err = 1;
3772         int nhn = 0;
3773         int replace = (cfg->fc_nlinfo.nlh &&
3774                        (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
3775         LIST_HEAD(rt6_nh_list);
3776
3777         nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
3778         if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
3779                 nlflags |= NLM_F_APPEND;
3780
3781         remaining = cfg->fc_mp_len;
3782         rtnh = (struct rtnexthop *)cfg->fc_mp;
3783
3784         /* Parse a Multipath Entry and build a list (rt6_nh_list) of
3785          * rt6_info structs per nexthop
3786          */
3787         while (rtnh_ok(rtnh, remaining)) {
3788                 memcpy(&r_cfg, cfg, sizeof(*cfg));
3789                 if (rtnh->rtnh_ifindex)
3790                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3791
3792                 attrlen = rtnh_attrlen(rtnh);
3793                 if (attrlen > 0) {
3794                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3795
3796                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3797                         if (nla) {
3798                                 r_cfg.fc_gateway = nla_get_in6_addr(nla);
3799                                 r_cfg.fc_flags |= RTF_GATEWAY;
3800                         }
3801                         r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
3802                         nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
3803                         if (nla)
3804                                 r_cfg.fc_encap_type = nla_get_u16(nla);
3805                 }
3806
3807                 rt = ip6_route_info_create(&r_cfg, extack);
3808                 if (IS_ERR(rt)) {
3809                         err = PTR_ERR(rt);
3810                         rt = NULL;
3811                         goto cleanup;
3812                 }
3813
3814                 err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
3815                 if (err) {
3816                         dst_release_immediate(&rt->dst);
3817                         goto cleanup;
3818                 }
3819
3820                 rtnh = rtnh_next(rtnh, &remaining);
3821         }
3822
3823         /* for add and replace send one notification with all nexthops.
3824          * Skip the notification in fib6_add_rt2node and send one with
3825          * the full route when done
3826          */
3827         info->skip_notify = 1;
3828
3829         err_nh = NULL;
3830         list_for_each_entry(nh, &rt6_nh_list, next) {
3831                 rt_last = nh->rt6_info;
3832                 err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
3833                 /* save reference to first route for notification */
3834                 if (!rt_notif && !err)
3835                         rt_notif = nh->rt6_info;
3836
3837                 /* nh->rt6_info is used or freed at this point, reset to NULL*/
3838                 nh->rt6_info = NULL;
3839                 if (err) {
3840                         if (replace && nhn)
3841                                 ip6_print_replace_route_err(&rt6_nh_list);
3842                         err_nh = nh;
3843                         goto add_errout;
3844                 }
3845
3846                 /* Because each route is added like a single route we remove
3847                  * these flags after the first nexthop: if there is a collision,
3848                  * we have already failed to add the first nexthop:
3849                  * fib6_add_rt2node() has rejected it; when replacing, old
3850                  * nexthops have been replaced by first new, the rest should
3851                  * be added to it.
3852                  */
3853                 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
3854                                                      NLM_F_REPLACE);
3855                 nhn++;
3856         }
3857
3858         /* success ... tell user about new route */
3859         ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3860         goto cleanup;
3861
3862 add_errout:
3863         /* send notification for routes that were added so that
3864          * the delete notifications sent by ip6_route_del are
3865          * coherent
3866          */
3867         if (rt_notif)
3868                 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3869
3870         /* Delete routes that were already added */
3871         list_for_each_entry(nh, &rt6_nh_list, next) {
3872                 if (err_nh == nh)
3873                         break;
3874                 ip6_route_del(&nh->r_cfg, extack);
3875         }
3876
3877 cleanup:
3878         list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
3879                 if (nh->rt6_info)
3880                         dst_release_immediate(&nh->rt6_info->dst);
3881                 kfree(nh->mxc.mx);
3882                 list_del(&nh->next);
3883                 kfree(nh);
3884         }
3885
3886         return err;
3887 }
3888
3889 static int ip6_route_multipath_del(struct fib6_config *cfg,
3890                                    struct netlink_ext_ack *extack)
3891 {
3892         struct fib6_config r_cfg;
3893         struct rtnexthop *rtnh;
3894         int remaining;
3895         int attrlen;
3896         int err = 1, last_err = 0;
3897
3898         remaining = cfg->fc_mp_len;
3899         rtnh = (struct rtnexthop *)cfg->fc_mp;
3900
3901         /* Parse a Multipath Entry */
3902         while (rtnh_ok(rtnh, remaining)) {
3903                 memcpy(&r_cfg, cfg, sizeof(*cfg));
3904                 if (rtnh->rtnh_ifindex)
3905                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3906
3907                 attrlen = rtnh_attrlen(rtnh);
3908                 if (attrlen > 0) {
3909                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3910
3911                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3912                         if (nla) {
3913                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
3914                                 r_cfg.fc_flags |= RTF_GATEWAY;
3915                         }
3916                 }
3917                 err = ip6_route_del(&r_cfg, extack);
3918                 if (err)
3919                         last_err = err;
3920
3921                 rtnh = rtnh_next(rtnh, &remaining);
3922         }
3923
3924         return last_err;
3925 }
3926
3927 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3928                               struct netlink_ext_ack *extack)
3929 {
3930         struct fib6_config cfg;
3931         int err;
3932
3933         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3934         if (err < 0)
3935                 return err;
3936
3937         if (cfg.fc_mp)
3938                 return ip6_route_multipath_del(&cfg, extack);
3939         else {
3940                 cfg.fc_delete_all_nh = 1;
3941                 return ip6_route_del(&cfg, extack);
3942         }
3943 }
3944
3945 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3946                               struct netlink_ext_ack *extack)
3947 {
3948         struct fib6_config cfg;
3949         int err;
3950
3951         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3952         if (err < 0)
3953                 return err;
3954
3955         if (cfg.fc_mp)
3956                 return ip6_route_multipath_add(&cfg, extack);
3957         else
3958                 return ip6_route_add(&cfg, extack);
3959 }
3960
3961 static size_t rt6_nlmsg_size(struct rt6_info *rt)
3962 {
3963         int nexthop_len = 0;
3964
3965         if (rt->rt6i_nsiblings) {
3966                 nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
3967                             + NLA_ALIGN(sizeof(struct rtnexthop))
3968                             + nla_total_size(16) /* RTA_GATEWAY */
3969                             + lwtunnel_get_encap_size(rt->dst.lwtstate);
3970
3971                 nexthop_len *= rt->rt6i_nsiblings;
3972         }
3973
3974         return NLMSG_ALIGN(sizeof(struct rtmsg))
3975                + nla_total_size(16) /* RTA_SRC */
3976                + nla_total_size(16) /* RTA_DST */
3977                + nla_total_size(16) /* RTA_GATEWAY */
3978                + nla_total_size(16) /* RTA_PREFSRC */
3979                + nla_total_size(4) /* RTA_TABLE */
3980                + nla_total_size(4) /* RTA_IIF */
3981                + nla_total_size(4) /* RTA_OIF */
3982                + nla_total_size(4) /* RTA_PRIORITY */
3983                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
3984                + nla_total_size(sizeof(struct rta_cacheinfo))
3985                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
3986                + nla_total_size(1) /* RTA_PREF */
3987                + lwtunnel_get_encap_size(rt->dst.lwtstate)
3988                + nexthop_len;
3989 }
3990
3991 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
3992                             unsigned int *flags, bool skip_oif)
3993 {
3994         if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) {
3995                 *flags |= RTNH_F_LINKDOWN;
3996                 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
3997                         *flags |= RTNH_F_DEAD;
3998         }
3999
4000         if (rt->rt6i_flags & RTF_GATEWAY) {
4001                 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
4002                         goto nla_put_failure;
4003         }
4004
4005         if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
4006                 *flags |= RTNH_F_OFFLOAD;
4007
4008         /* not needed for multipath encoding b/c it has a rtnexthop struct */
4009         if (!skip_oif && rt->dst.dev &&
4010             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
4011                 goto nla_put_failure;
4012
4013         if (rt->dst.lwtstate &&
4014             lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
4015                 goto nla_put_failure;
4016
4017         return 0;
4018
4019 nla_put_failure:
4020         return -EMSGSIZE;
4021 }
4022
4023 /* add multipath next hop */
4024 static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
4025 {
4026         struct rtnexthop *rtnh;
4027         unsigned int flags = 0;
4028
4029         rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4030         if (!rtnh)
4031                 goto nla_put_failure;
4032
4033         rtnh->rtnh_hops = 0;
4034         rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
4035
4036         if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4037                 goto nla_put_failure;
4038
4039         rtnh->rtnh_flags = flags;
4040
4041         /* length of rtnetlink header + attributes */
4042         rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4043
4044         return 0;
4045
4046 nla_put_failure:
4047         return -EMSGSIZE;
4048 }
4049
/* Serialise route @rt into @skb as one netlink route message.
 *
 * @net:    namespace, used for source address selection and ip6mr lookup
 * @skb:    message buffer being filled
 * @rt:     route to describe
 * @dst:    if non-NULL, reported as a /128 RTA_DST instead of the route's
 *          own destination prefix
 * @src:    same for RTA_SRC (only with CONFIG_IPV6_SUBTREES)
 * @iif:    input interface index, or 0
 * @type:   netlink message type
 * @portid: netlink port id for the message header
 * @seq:    netlink sequence number for the message header
 * @flags:  netlink message flags
 *
 * Returns 0 on success.  Returns -EMSGSIZE when the header or any attribute
 * does not fit; once the header has been placed, the partial message is
 * cancelled first.
 */
static int rt6_fill_node(struct net *net,
                         struct sk_buff *skb, struct rt6_info *rt,
                         struct in6_addr *dst, struct in6_addr *src,
                         int iif, int type, u32 portid, u32 seq,
                         unsigned int flags)
{
        u32 metrics[RTAX_MAX];
        struct rtmsg *rtm;
        struct nlmsghdr *nlh;
        long expires;
        u32 table;

        nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
        if (!nlh)
                return -EMSGSIZE;

        rtm = nlmsg_data(nlh);
        rtm->rtm_family = AF_INET6;
        rtm->rtm_dst_len = rt->rt6i_dst.plen;
        rtm->rtm_src_len = rt->rt6i_src.plen;
        rtm->rtm_tos = 0;
        if (rt->rt6i_table)
                table = rt->rt6i_table->tb6_id;
        else
                table = RT6_TABLE_UNSPEC;
        /* the table id goes both into the header and into RTA_TABLE */
        rtm->rtm_table = table;
        if (nla_put_u32(skb, RTA_TABLE, table))
                goto nla_put_failure;
        /* derive the route type: for reject routes from dst.error,
         * otherwise from the route flags and the device
         */
        if (rt->rt6i_flags & RTF_REJECT) {
                switch (rt->dst.error) {
                case -EINVAL:
                        rtm->rtm_type = RTN_BLACKHOLE;
                        break;
                case -EACCES:
                        rtm->rtm_type = RTN_PROHIBIT;
                        break;
                case -EAGAIN:
                        rtm->rtm_type = RTN_THROW;
                        break;
                default:
                        rtm->rtm_type = RTN_UNREACHABLE;
                        break;
                }
        }
        else if (rt->rt6i_flags & RTF_LOCAL)
                rtm->rtm_type = RTN_LOCAL;
        else if (rt->rt6i_flags & RTF_ANYCAST)
                rtm->rtm_type = RTN_ANYCAST;
        else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
                rtm->rtm_type = RTN_LOCAL;
        else
                rtm->rtm_type = RTN_UNICAST;
        rtm->rtm_flags = 0;
        rtm->rtm_scope = RT_SCOPE_UNIVERSE;
        rtm->rtm_protocol = rt->rt6i_protocol;

        if (rt->rt6i_flags & RTF_CACHE)
                rtm->rtm_flags |= RTM_F_CLONED;

        /* an explicit @dst overrides the route's prefix and is a /128 */
        if (dst) {
                if (nla_put_in6_addr(skb, RTA_DST, dst))
                        goto nla_put_failure;
                rtm->rtm_dst_len = 128;
        } else if (rtm->rtm_dst_len)
                if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
                        goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
        if (src) {
                if (nla_put_in6_addr(skb, RTA_SRC, src))
                        goto nla_put_failure;
                rtm->rtm_src_len = 128;
        } else if (rtm->rtm_src_len &&
                   nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
                goto nla_put_failure;
#endif
        if (iif) {
#ifdef CONFIG_IPV6_MROUTE
                /* multicast destinations are handed to ip6mr_get_route();
                 * a 0 result returns straight away without reaching
                 * nlmsg_end() in this function, a positive result
                 * continues below
                 */
                if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
                        int err = ip6mr_get_route(net, skb, rtm, portid);

                        if (err == 0)
                                return 0;
                        if (err < 0)
                                goto nla_put_failure;
                } else
#endif
                        if (nla_put_u32(skb, RTA_IIF, iif))
                                goto nla_put_failure;
        } else if (dst) {
                /* no input interface: report the source address that
                 * ip6_route_get_saddr() picks for @dst, if it finds one
                 */
                struct in6_addr saddr_buf;
                if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
                    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
                        goto nla_put_failure;
        }

        /* the route's configured preferred source, when it has one */
        if (rt->rt6i_prefsrc.plen) {
                struct in6_addr saddr_buf;
                saddr_buf = rt->rt6i_prefsrc.addr;
                if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
                        goto nla_put_failure;
        }

        /* report a copy of the metrics, with MTU overridden by rt6i_pmtu
         * when that is set
         */
        memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
        if (rt->rt6i_pmtu)
                metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
        if (rtnetlink_put_metrics(skb, metrics) < 0)
                goto nla_put_failure;

        if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
                goto nla_put_failure;

        /* For multipath routes, walk the siblings list and add
         * each as a nexthop within RTA_MULTIPATH.
         */
        if (rt->rt6i_nsiblings) {
                struct rt6_info *sibling, *next_sibling;
                struct nlattr *mp;

                mp = nla_nest_start(skb, RTA_MULTIPATH);
                if (!mp)
                        goto nla_put_failure;

                /* @rt itself is the first nexthop, its siblings follow */
                if (rt6_add_nexthop(skb, rt) < 0)
                        goto nla_put_failure;

                list_for_each_entry_safe(sibling, next_sibling,
                                         &rt->rt6i_siblings, rt6i_siblings) {
                        if (rt6_add_nexthop(skb, sibling) < 0)
                                goto nla_put_failure;
                }

                nla_nest_end(skb, mp);
        } else {
                /* single nexthop: attributes go at the top level and the
                 * RTNH_F_* bits into rtm_flags
                 */
                if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
                        goto nla_put_failure;
        }

        /* remaining lifetime in jiffies; 0 when RTF_EXPIRES is not set */
        expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;

        if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
                goto nla_put_failure;

        if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
                goto nla_put_failure;


        nlmsg_end(skb, nlh);
        return 0;

nla_put_failure:
        /* drop everything written since nlmsg_put() */
        nlmsg_cancel(skb, nlh);
        return -EMSGSIZE;
}
4203
4204 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
4205 {
4206         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4207         struct net *net = arg->net;
4208
4209         if (rt == net->ipv6.ip6_null_entry)
4210                 return 0;
4211
4212         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4213                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4214
4215                 /* user wants prefix routes only */
4216                 if (rtm->rtm_flags & RTM_F_PREFIX &&
4217                     !(rt->rt6i_flags & RTF_PREFIX_RT)) {
4218                         /* success since this is not a prefix route */
4219                         return 1;
4220                 }
4221         }
4222
4223         return rt6_fill_node(net,
4224                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
4225                      NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
4226                      NLM_F_MULTI);
4227 }
4228
4229 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4230                               struct netlink_ext_ack *extack)
4231 {
4232         struct net *net = sock_net(in_skb->sk);
4233         struct nlattr *tb[RTA_MAX+1];
4234         int err, iif = 0, oif = 0;
4235         struct dst_entry *dst;
4236         struct rt6_info *rt;
4237         struct sk_buff *skb;
4238         struct rtmsg *rtm;
4239         struct flowi6 fl6;
4240         bool fibmatch;
4241
4242         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4243                           extack);
4244         if (err < 0)
4245                 goto errout;
4246
4247         err = -EINVAL;
4248         memset(&fl6, 0, sizeof(fl6));
4249         rtm = nlmsg_data(nlh);
4250         fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4251         fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4252
4253         if (tb[RTA_SRC]) {
4254                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4255                         goto errout;
4256
4257                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4258         }
4259
4260         if (tb[RTA_DST]) {
4261                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4262                         goto errout;
4263
4264                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4265         }
4266
4267         if (tb[RTA_IIF])
4268                 iif = nla_get_u32(tb[RTA_IIF]);
4269
4270         if (tb[RTA_OIF])
4271                 oif = nla_get_u32(tb[RTA_OIF]);
4272
4273         if (tb[RTA_MARK])
4274                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4275
4276         if (tb[RTA_UID])
4277                 fl6.flowi6_uid = make_kuid(current_user_ns(),
4278                                            nla_get_u32(tb[RTA_UID]));
4279         else
4280                 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4281
4282         if (iif) {
4283                 struct net_device *dev;
4284                 int flags = 0;
4285
4286                 rcu_read_lock();
4287
4288                 dev = dev_get_by_index_rcu(net, iif);
4289                 if (!dev) {
4290                         rcu_read_unlock();
4291                         err = -ENODEV;
4292                         goto errout;
4293                 }
4294
4295                 fl6.flowi6_iif = iif;
4296
4297                 if (!ipv6_addr_any(&fl6.saddr))
4298                         flags |= RT6_LOOKUP_F_HAS_SADDR;
4299
4300                 if (!fibmatch)
4301                         dst = ip6_route_input_lookup(net, dev, &fl6, flags);
4302                 else
4303                         dst = ip6_route_lookup(net, &fl6, 0);
4304
4305                 rcu_read_unlock();
4306         } else {
4307                 fl6.flowi6_oif = oif;
4308
4309                 if (!fibmatch)
4310                         dst = ip6_route_output(net, NULL, &fl6);
4311                 else
4312                         dst = ip6_route_lookup(net, &fl6, 0);
4313         }
4314
4315
4316         rt = container_of(dst, struct rt6_info, dst);
4317         if (rt->dst.error) {
4318                 err = rt->dst.error;
4319                 ip6_rt_put(rt);
4320                 goto errout;
4321         }
4322
4323         if (rt == net->ipv6.ip6_null_entry) {
4324                 err = rt->dst.error;
4325                 ip6_rt_put(rt);
4326                 goto errout;
4327         }
4328
4329         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4330         if (!skb) {
4331                 ip6_rt_put(rt);
4332                 err = -ENOBUFS;
4333                 goto errout;
4334         }
4335
4336         skb_dst_set(skb, &rt->dst);
4337         if (fibmatch)
4338                 err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
4339                                     RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4340                                     nlh->nlmsg_seq, 0);
4341         else
4342                 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
4343                                     RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4344                                     nlh->nlmsg_seq, 0);
4345         if (err < 0) {
4346                 kfree_skb(skb);
4347                 goto errout;
4348         }
4349
4350         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4351 errout:
4352         return err;
4353 }
4354
4355 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
4356                      unsigned int nlm_flags)
4357 {
4358         struct sk_buff *skb;
4359         struct net *net = info->nl_net;
4360         u32 seq;
4361         int err;
4362
4363         err = -ENOBUFS;
4364         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4365
4366         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4367         if (!skb)
4368                 goto errout;
4369
4370         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
4371                                 event, info->portid, seq, nlm_flags);
4372         if (err < 0) {
4373                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4374                 WARN_ON(err == -EMSGSIZE);
4375                 kfree_skb(skb);
4376                 goto errout;
4377         }
4378         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4379                     info->nlh, gfp_any());
4380         return;
4381 errout:
4382         if (err < 0)
4383                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4384 }
4385
4386 static int ip6_route_dev_notify(struct notifier_block *this,
4387                                 unsigned long event, void *ptr)
4388 {
4389         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4390         struct net *net = dev_net(dev);
4391
4392         if (!(dev->flags & IFF_LOOPBACK))
4393                 return NOTIFY_OK;
4394
4395         if (event == NETDEV_REGISTER) {
4396                 net->ipv6.ip6_null_entry->dst.dev = dev;
4397                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
4398 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4399                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
4400                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
4401                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
4402                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
4403 #endif
4404          } else if (event == NETDEV_UNREGISTER &&
4405                     dev->reg_state != NETREG_UNREGISTERED) {
4406                 /* NETDEV_UNREGISTER could be fired for multiple times by
4407                  * netdev_wait_allrefs(). Make sure we only call this once.
4408                  */
4409                 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
4410 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4411                 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
4412                 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
4413 #endif
4414         }
4415
4416         return NOTIFY_OK;
4417 }
4418
4419 /*
4420  *      /proc
4421  */
4422
4423 #ifdef CONFIG_PROC_FS
4424
4425 static const struct file_operations ipv6_route_proc_fops = {
4426         .owner          = THIS_MODULE,
4427         .open           = ipv6_route_open,
4428         .read           = seq_read,
4429         .llseek         = seq_lseek,
4430         .release        = seq_release_net,
4431 };
4432
4433 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
4434 {
4435         struct net *net = (struct net *)seq->private;
4436         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
4437                    net->ipv6.rt6_stats->fib_nodes,
4438                    net->ipv6.rt6_stats->fib_route_nodes,
4439                    atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
4440                    net->ipv6.rt6_stats->fib_rt_entries,
4441                    net->ipv6.rt6_stats->fib_rt_cache,
4442                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
4443                    net->ipv6.rt6_stats->fib_discarded_routes);
4444
4445         return 0;
4446 }
4447
4448 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
4449 {
4450         return single_open_net(inode, file, rt6_stats_seq_show);
4451 }
4452
4453 static const struct file_operations rt6_stats_seq_fops = {
4454         .owner   = THIS_MODULE,
4455         .open    = rt6_stats_seq_open,
4456         .read    = seq_read,
4457         .llseek  = seq_lseek,
4458         .release = single_release_net,
4459 };
4460 #endif  /* CONFIG_PROC_FS */
4461
4462 #ifdef CONFIG_SYSCTL
4463
4464 static
4465 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4466                               void __user *buffer, size_t *lenp, loff_t *ppos)
4467 {
4468         struct net *net;
4469         int delay;
4470         if (!write)
4471                 return -EINVAL;
4472
4473         net = (struct net *)ctl->extra1;
4474         delay = net->ipv6.sysctl.flush_delay;
4475         proc_dointvec(ctl, write, buffer, lenp, ppos);
4476         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
4477         return 0;
4478 }
4479
4480 struct ctl_table ipv6_route_table_template[] = {
4481         {
4482                 .procname       =       "flush",
4483                 .data           =       &init_net.ipv6.sysctl.flush_delay,
4484                 .maxlen         =       sizeof(int),
4485                 .mode           =       0200,
4486                 .proc_handler   =       ipv6_sysctl_rtcache_flush
4487         },
4488         {
4489                 .procname       =       "gc_thresh",
4490                 .data           =       &ip6_dst_ops_template.gc_thresh,
4491                 .maxlen         =       sizeof(int),
4492                 .mode           =       0644,
4493                 .proc_handler   =       proc_dointvec,
4494         },
4495         {
4496                 .procname       =       "max_size",
4497                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
4498                 .maxlen         =       sizeof(int),
4499                 .mode           =       0644,
4500                 .proc_handler   =       proc_dointvec,
4501         },
4502         {
4503                 .procname       =       "gc_min_interval",
4504                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4505                 .maxlen         =       sizeof(int),
4506                 .mode           =       0644,
4507                 .proc_handler   =       proc_dointvec_jiffies,
4508         },
4509         {
4510                 .procname       =       "gc_timeout",
4511                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
4512                 .maxlen         =       sizeof(int),
4513                 .mode           =       0644,
4514                 .proc_handler   =       proc_dointvec_jiffies,
4515         },
4516         {
4517                 .procname       =       "gc_interval",
4518                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
4519                 .maxlen         =       sizeof(int),
4520                 .mode           =       0644,
4521                 .proc_handler   =       proc_dointvec_jiffies,
4522         },
4523         {
4524                 .procname       =       "gc_elasticity",
4525                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
4526                 .maxlen         =       sizeof(int),
4527                 .mode           =       0644,
4528                 .proc_handler   =       proc_dointvec,
4529         },
4530         {
4531                 .procname       =       "mtu_expires",
4532                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
4533                 .maxlen         =       sizeof(int),
4534                 .mode           =       0644,
4535                 .proc_handler   =       proc_dointvec_jiffies,
4536         },
4537         {
4538                 .procname       =       "min_adv_mss",
4539                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
4540                 .maxlen         =       sizeof(int),
4541                 .mode           =       0644,
4542                 .proc_handler   =       proc_dointvec,
4543         },
4544         {
4545                 .procname       =       "gc_min_interval_ms",
4546                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4547                 .maxlen         =       sizeof(int),
4548                 .mode           =       0644,
4549                 .proc_handler   =       proc_dointvec_ms_jiffies,
4550         },
4551         { }
4552 };
4553
4554 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
4555 {
4556         struct ctl_table *table;
4557
4558         table = kmemdup(ipv6_route_table_template,
4559                         sizeof(ipv6_route_table_template),
4560                         GFP_KERNEL);
4561
4562         if (table) {
4563                 table[0].data = &net->ipv6.sysctl.flush_delay;
4564                 table[0].extra1 = net;
4565                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
4566                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
4567                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4568                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
4569                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
4570                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
4571                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
4572                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
4573                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4574
4575                 /* Don't export sysctls to unprivileged users */
4576                 if (net->user_ns != &init_user_ns)
4577                         table[0].procname = NULL;
4578         }
4579
4580         return table;
4581 }
4582 #endif
4583
4584 static int __net_init ip6_route_net_init(struct net *net)
4585 {
4586         int ret = -ENOMEM;
4587
4588         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
4589                sizeof(net->ipv6.ip6_dst_ops));
4590
4591         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
4592                 goto out_ip6_dst_ops;
4593
4594         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
4595                                            sizeof(*net->ipv6.ip6_null_entry),
4596                                            GFP_KERNEL);
4597         if (!net->ipv6.ip6_null_entry)
4598                 goto out_ip6_dst_entries;
4599         net->ipv6.ip6_null_entry->dst.path =
4600                 (struct dst_entry *)net->ipv6.ip6_null_entry;
4601         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4602         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
4603                          ip6_template_metrics, true);
4604
4605 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4606         net->ipv6.fib6_has_custom_rules = false;
4607         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
4608                                                sizeof(*net->ipv6.ip6_prohibit_entry),
4609                                                GFP_KERNEL);
4610         if (!net->ipv6.ip6_prohibit_entry)
4611                 goto out_ip6_null_entry;
4612         net->ipv6.ip6_prohibit_entry->dst.path =
4613                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
4614         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4615         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
4616                          ip6_template_metrics, true);
4617
4618         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
4619                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
4620                                                GFP_KERNEL);
4621         if (!net->ipv6.ip6_blk_hole_entry)
4622                 goto out_ip6_prohibit_entry;
4623         net->ipv6.ip6_blk_hole_entry->dst.path =
4624                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
4625         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4626         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
4627                          ip6_template_metrics, true);
4628 #endif
4629
4630         net->ipv6.sysctl.flush_delay = 0;
4631         net->ipv6.sysctl.ip6_rt_max_size = 4096;
4632         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
4633         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
4634         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
4635         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
4636         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
4637         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
4638
4639         net->ipv6.ip6_rt_gc_expire = 30*HZ;
4640
4641         ret = 0;
4642 out:
4643         return ret;
4644
4645 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4646 out_ip6_prohibit_entry:
4647         kfree(net->ipv6.ip6_prohibit_entry);
4648 out_ip6_null_entry:
4649         kfree(net->ipv6.ip6_null_entry);
4650 #endif
4651 out_ip6_dst_entries:
4652         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
4653 out_ip6_dst_ops:
4654         goto out;
4655 }
4656
4657 static void __net_exit ip6_route_net_exit(struct net *net)
4658 {
4659         kfree(net->ipv6.ip6_null_entry);
4660 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4661         kfree(net->ipv6.ip6_prohibit_entry);
4662         kfree(net->ipv6.ip6_blk_hole_entry);
4663 #endif
4664         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
4665 }
4666
4667 static int __net_init ip6_route_net_init_late(struct net *net)
4668 {
4669 #ifdef CONFIG_PROC_FS
4670         proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
4671         proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
4672 #endif
4673         return 0;
4674 }
4675
4676 static void __net_exit ip6_route_net_exit_late(struct net *net)
4677 {
4678 #ifdef CONFIG_PROC_FS
4679         remove_proc_entry("ipv6_route", net->proc_net);
4680         remove_proc_entry("rt6_stats", net->proc_net);
4681 #endif
4682 }
4683
4684 static struct pernet_operations ip6_route_net_ops = {
4685         .init = ip6_route_net_init,
4686         .exit = ip6_route_net_exit,
4687 };
4688
4689 static int __net_init ipv6_inetpeer_init(struct net *net)
4690 {
4691         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
4692
4693         if (!bp)
4694                 return -ENOMEM;
4695         inet_peer_base_init(bp);
4696         net->ipv6.peers = bp;
4697         return 0;
4698 }
4699
4700 static void __net_exit ipv6_inetpeer_exit(struct net *net)
4701 {
4702         struct inet_peer_base *bp = net->ipv6.peers;
4703
4704         net->ipv6.peers = NULL;
4705         inetpeer_invalidate_tree(bp);
4706         kfree(bp);
4707 }
4708
4709 static struct pernet_operations ipv6_inetpeer_ops = {
4710         .init   =       ipv6_inetpeer_init,
4711         .exit   =       ipv6_inetpeer_exit,
4712 };
4713
4714 static struct pernet_operations ip6_route_net_late_ops = {
4715         .init = ip6_route_net_init_late,
4716         .exit = ip6_route_net_exit_late,
4717 };
4718
4719 static struct notifier_block ip6_route_dev_notifier = {
4720         .notifier_call = ip6_route_dev_notify,
4721         .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
4722 };
4723
4724 void __init ip6_route_init_special_entries(void)
4725 {
4726         /* Registering of the loopback is done before this portion of code,
4727          * the loopback reference in rt6_info will not be taken, do it
4728          * manually for init_net */
4729         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
4730         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4731   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4732         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
4733         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4734         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
4735         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4736   #endif
4737 }
4738
4739 int __init ip6_route_init(void)
4740 {
4741         int ret;
4742         int cpu;
4743
4744         ret = -ENOMEM;
4745         ip6_dst_ops_template.kmem_cachep =
4746                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
4747                                   SLAB_HWCACHE_ALIGN, NULL);
4748         if (!ip6_dst_ops_template.kmem_cachep)
4749                 goto out;
4750
4751         ret = dst_entries_init(&ip6_dst_blackhole_ops);
4752         if (ret)
4753                 goto out_kmem_cache;
4754
4755         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
4756         if (ret)
4757                 goto out_dst_entries;
4758
4759         ret = register_pernet_subsys(&ip6_route_net_ops);
4760         if (ret)
4761                 goto out_register_inetpeer;
4762
4763         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
4764
4765         ret = fib6_init();
4766         if (ret)
4767                 goto out_register_subsys;
4768
4769         ret = xfrm6_init();
4770         if (ret)
4771                 goto out_fib6_init;
4772
4773         ret = fib6_rules_init();
4774         if (ret)
4775                 goto xfrm6_init;
4776
4777         ret = register_pernet_subsys(&ip6_route_net_late_ops);
4778         if (ret)
4779                 goto fib6_rules_init;
4780
4781         ret = -ENOBUFS;
4782         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, 0) ||
4783             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, 0) ||
4784             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL,
4785                             RTNL_FLAG_DOIT_UNLOCKED))
4786                 goto out_register_late_subsys;
4787
4788         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
4789         if (ret)
4790                 goto out_register_late_subsys;
4791
4792         for_each_possible_cpu(cpu) {
4793                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
4794
4795                 INIT_LIST_HEAD(&ul->head);
4796                 spin_lock_init(&ul->lock);
4797         }
4798
4799 out:
4800         return ret;
4801
4802 out_register_late_subsys:
4803         unregister_pernet_subsys(&ip6_route_net_late_ops);
4804 fib6_rules_init:
4805         fib6_rules_cleanup();
4806 xfrm6_init:
4807         xfrm6_fini();
4808 out_fib6_init:
4809         fib6_gc_cleanup();
4810 out_register_subsys:
4811         unregister_pernet_subsys(&ip6_route_net_ops);
4812 out_register_inetpeer:
4813         unregister_pernet_subsys(&ipv6_inetpeer_ops);
4814 out_dst_entries:
4815         dst_entries_destroy(&ip6_dst_blackhole_ops);
4816 out_kmem_cache:
4817         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
4818         goto out;
4819 }
4820
/*
 * Tear down what ip6_route_init() set up: the netdevice notifier, the
 * per-namespace operations, the fib6 rules, xfrm6 and fib6 state, the
 * blackhole dst entry counter and finally the rt6_info slab cache.
 *
 * NOTE(review): the rtnetlink handlers registered by ip6_route_init()
 * are not unregistered in this function - confirm that this happens
 * elsewhere.
 */
4821 void ip6_route_cleanup(void)
4822 {
4823         unregister_netdevice_notifier(&ip6_route_dev_notifier);
4824         unregister_pernet_subsys(&ip6_route_net_late_ops);
4825         fib6_rules_cleanup();
4826         xfrm6_fini();
4827         fib6_gc_cleanup();
4828         unregister_pernet_subsys(&ipv6_inetpeer_ops);
4829         unregister_pernet_subsys(&ip6_route_net_ops);
4830         dst_entries_destroy(&ip6_dst_blackhole_ops);
             /* the slab cache goes last, after its users are gone */
4831         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
4832 }