/* net/ipv6/route.c — from linux.git (merge branch 'fib6-rcu'),
 * as served by the asedeno.scripts.mit.edu gitweb mirror.
 */
/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		reachable.  otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <trace/events/fib6.h>
67
68 #include <linux/uaccess.h>
69
70 #ifdef CONFIG_SYSCTL
71 #include <linux/sysctl.h>
72 #endif
73
/* Outcome of a next-hop reachability check used when scoring routes.
 * Negative values are failures: find_match() treats RT6_NUD_FAIL_DO_RR
 * as "fall back to round-robin", RT6_NUD_FAIL_PROBE as "usable but
 * scores below any success", and RT6_NUD_FAIL_HARD as "do not use".
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};
80
/* Forward declarations for the dst_ops callbacks and internal helpers
 * defined later in this file.
 */
static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
static unsigned int	 ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
static int		 ip6_dst_gc(struct dst_ops *ops);

static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int		ip6_pkt_prohibit(struct sk_buff *skb);
static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static void		rt6_dst_from_metrics_check(struct rt6_info *rt);
static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct rt6_info *rt);
static int rt6_fill_node(struct net *net,
			 struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr);

/* Route Information option (RFC 4191) helpers, only built when
 * CONFIG_IPV6_ROUTE_INFO is enabled.
 */
#ifdef CONFIG_IPV6_ROUTE_INFO
static struct rt6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev,
					   unsigned int pref);
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev);
#endif
123
/* Per-cpu list of "uncached" routes (entries linked via rt6i_uncached)
 * so rt6_uncached_list_flush_dev() can find and re-home them when their
 * device disappears.
 */
struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
130
/* Link @rt onto this cpu's uncached list, remembering which list it
 * went on so rt6_uncached_list_del() can take the matching lock later.
 */
static void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}
141
/* Unlink @rt from the per-cpu uncached list it was added to (if any)
 * and drop it from the per-netns uncached route counter.
 */
static void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}
154
/* @dev is going away: walk every cpu's uncached list and re-home any
 * route still referencing it (via rt6i_idev and/or dst.dev) onto the
 * netns loopback device, fixing up the reference counts as we go.
 */
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	/* Nothing to migrate to if @dev already is the loopback device. */
	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			/* Swap the inet6_dev reference over to loopback. */
			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			/* Swap the net_device reference over to loopback. */
			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
186
/* Per-cpu clones keep their metrics in the dst they were copied from
 * (dst.from); return a writable pointer into that parent's metrics.
 */
static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
{
	return dst_metrics_write_ptr(rt->dst.from);
}
191
192 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
193 {
194         struct rt6_info *rt = (struct rt6_info *)dst;
195
196         if (rt->rt6i_flags & RTF_PCPU)
197                 return rt6_pcpu_cow_metrics(rt);
198         else if (rt->rt6i_flags & RTF_CACHE)
199                 return NULL;
200         else
201                 return dst_cow_metrics_generic(dst, old);
202 }
203
204 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
205                                              struct sk_buff *skb,
206                                              const void *daddr)
207 {
208         struct in6_addr *p = &rt->rt6i_gateway;
209
210         if (!ipv6_addr_any(p))
211                 return (const void *) p;
212         else if (skb)
213                 return &ipv6_hdr(skb)->daddr;
214         return daddr;
215 }
216
217 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
218                                           struct sk_buff *skb,
219                                           const void *daddr)
220 {
221         struct rt6_info *rt = (struct rt6_info *) dst;
222         struct neighbour *n;
223
224         daddr = choose_neigh_daddr(rt, skb, daddr);
225         n = __ipv6_neigh_lookup(dst->dev, daddr);
226         if (n)
227                 return n;
228         return neigh_create(&nd_tbl, daddr, dst->dev);
229 }
230
231 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
232 {
233         struct net_device *dev = dst->dev;
234         struct rt6_info *rt = (struct rt6_info *)dst;
235
236         daddr = choose_neigh_daddr(rt, NULL, daddr);
237         if (!daddr)
238                 return;
239         if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
240                 return;
241         if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
242                 return;
243         __ipv6_confirm_neigh(dev, daddr);
244 }
245
/* dst_ops used for ordinary IPv6 routes (copied into each netns'
 * ip6_dst_ops; see the net->ipv6.ip6_dst_ops users in this file).
 */
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	ipv6_cow_metrics,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};
264
265 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
266 {
267         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
268
269         return mtu ? : dst->dev->mtu;
270 }
271
/* Deliberate no-op: blackhole dsts ignore PMTU updates. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}
276
/* Deliberate no-op: blackhole dsts ignore redirects. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}
281
/* dst_ops for blackhole dst entries: no GC, and PMTU updates and
 * redirects are intentionally no-ops.
 */
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_neigh_lookup,
};
293
/* Metrics used by the special route templates below; only the hop
 * limit slot is named, and it is deliberately left at zero (unset).
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};
297
/* Template for the per-netns ip6_null_entry: a reject route that
 * discards packets and reports -ENETUNREACH to the sender.
 */
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,	/* worst possible metric */
	.rt6i_ref	= ATOMIC_INIT(1),
};
312
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

/* Template for the prohibit route used with policy routing: packets
 * are dropped and -EACCES is reported to the sender.
 */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,	/* worst possible metric */
	.rt6i_ref	= ATOMIC_INIT(1),
};

/* Template for the blackhole route used with policy routing: packets
 * are silently discarded (-EINVAL, no ICMP error generated).
 */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,	/* worst possible metric */
	.rt6i_ref	= ATOMIC_INIT(1),
};

#endif
346
/* Initialize the rt6_info-specific tail of a freshly allocated entry:
 * zero everything after the embedded dst_entry (dst_alloc already set
 * that part up), then self-link the list heads.
 */
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_siblings);
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}
355
356 /* allocate dst with ip6_dst_ops */
357 static struct rt6_info *__ip6_dst_alloc(struct net *net,
358                                         struct net_device *dev,
359                                         int flags)
360 {
361         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
362                                         1, DST_OBSOLETE_FORCE_CHK, flags);
363
364         if (rt) {
365                 rt6_info_init(rt);
366                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
367         }
368
369         return rt;
370 }
371
372 struct rt6_info *ip6_dst_alloc(struct net *net,
373                                struct net_device *dev,
374                                int flags)
375 {
376         struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
377
378         if (rt) {
379                 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
380                 if (rt->rt6i_pcpu) {
381                         int cpu;
382
383                         for_each_possible_cpu(cpu) {
384                                 struct rt6_info **p;
385
386                                 p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
387                                 /* no one shares rt */
388                                 *p =  NULL;
389                         }
390                 } else {
391                         dst_release_immediate(&rt->dst);
392                         return NULL;
393                 }
394         }
395
396         return rt;
397 }
398 EXPORT_SYMBOL(ip6_dst_alloc);
399
/* dst_ops->destroy: release everything a rt6_info owns — generic
 * metrics, the per-cpu clone array, its slot on the uncached list, the
 * inet6_dev reference, any exception bucket, and finally the reference
 * on the dst it was copied from.
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct rt6_exception_bucket *bucket;
	struct dst_entry *from = dst->from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	free_percpu(rt->rt6i_pcpu);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}
	/* Destroy runs after the last reference is dropped, so accessing
	 * the bucket without RCU protection is safe here (hence the
	 * unconditional rcu_dereference_protected(..., 1)).
	 */
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
	if (bucket) {
		rt->rt6i_exception_bucket = NULL;
		kfree(bucket);
	}

	dst->from = NULL;
	dst_release(from);
}
425
/* dst_ops->ifdown: @dev is going down — migrate this route's inet6_dev
 * reference over to the netns loopback device so the old device can be
 * released.
 */
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}
442
443 static bool __rt6_check_expired(const struct rt6_info *rt)
444 {
445         if (rt->rt6i_flags & RTF_EXPIRES)
446                 return time_after(jiffies, rt->dst.expires);
447         else
448                 return false;
449 }
450
451 static bool rt6_check_expired(const struct rt6_info *rt)
452 {
453         if (rt->rt6i_flags & RTF_EXPIRES) {
454                 if (time_after(jiffies, rt->dst.expires))
455                         return true;
456         } else if (rt->dst.from) {
457                 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
458                        rt6_check_expired((struct rt6_info *)rt->dst.from);
459         }
460         return false;
461 }
462
463 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
464                                              struct flowi6 *fl6, int oif,
465                                              int strict)
466 {
467         struct rt6_info *sibling, *next_sibling;
468         int route_choosen;
469
470         /* We might have already computed the hash for ICMPv6 errors. In such
471          * case it will always be non-zero. Otherwise now is the time to do it.
472          */
473         if (!fl6->mp_hash)
474                 fl6->mp_hash = rt6_multipath_hash(fl6, NULL);
475
476         route_choosen = fl6->mp_hash % (match->rt6i_nsiblings + 1);
477         /* Don't change the route, if route_choosen == 0
478          * (siblings does not include ourself)
479          */
480         if (route_choosen)
481                 list_for_each_entry_safe(sibling, next_sibling,
482                                 &match->rt6i_siblings, rt6i_siblings) {
483                         route_choosen--;
484                         if (route_choosen == 0) {
485                                 if (rt6_score_route(sibling, oif, strict) < 0)
486                                         break;
487                                 match = sibling;
488                                 break;
489                         }
490                 }
491         return match;
492 }
493
/*
 *	Route lookup. rcu_read_lock() should be held.
 */

/* Walk the leaf chain starting at @rt and pick the entry that best
 * matches @oif (or, when no oif was given, whose device owns @saddr).
 * Falls back to a loopback route, then to @rt itself, or to
 * ip6_null_entry when a strict interface match was required but none
 * was found.
 */
static inline struct rt6_info *rt6_device_match(struct net *net,
						    struct rt6_info *rt,
						    const struct in6_addr *saddr,
						    int oif,
						    int flags)
{
	struct rt6_info *local = NULL;
	struct rt6_info *sprt;

	/* Nothing to restrict on: any route will do. */
	if (!oif && ipv6_addr_any(saddr))
		goto out;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->dst.rt6_next)) {
		struct net_device *dev = sprt->dst.dev;

		if (oif) {
			/* Exact interface match wins immediately. */
			if (dev->ifindex == oif)
				return sprt;
			if (dev->flags & IFF_LOOPBACK) {
				if (!sprt->rt6i_idev ||
				    sprt->rt6i_idev->dev->ifindex != oif) {
					if (flags & RT6_LOOKUP_F_IFACE)
						continue;
					/* Keep an already remembered
					 * loopback route whose idev does
					 * match @oif.
					 */
					if (local &&
					    local->rt6i_idev->dev->ifindex == oif)
						continue;
				}
				local = sprt;
			}
		} else {
			/* No oif given: match on source address ownership. */
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif) {
		if (local)
			return local;

		if (flags & RT6_LOOKUP_F_IFACE)
			return net->ipv6.ip6_null_entry;
	}
out:
	return rt;
}
544
#ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred neighbour-solicitation work item used for router
 * reachability probing; carries a device reference until the probe
 * is sent.
 */
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

/* Workqueue callback: send a neighbour solicitation to the target's
 * solicited-node multicast address, then drop the device reference
 * taken in rt6_probe() and free the work item.
 */
static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}

/* Schedule a reachability probe for @rt's gateway unless its neighbour
 * entry is already valid or was updated within rtr_probe_interval.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct __rt6_probe_work *work;
	struct neighbour *neigh;
	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
		return;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		/* Rate-limit: recheck state and last update time under
		 * the neighbour lock before committing to a probe.
		 */
		work = NULL;
		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated +
			       rt->rt6i_idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		/* No neighbour entry yet: always probe. */
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = rt->rt6i_gateway;
		dev_hold(rt->dst.dev);	/* released by rt6_probe_deferred() */
		work->dev = rt->dst.dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
/* Router preference support disabled: probing is a no-op. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
615
616 /*
617  * Default Router Selection (RFC 2461 6.3.6)
618  */
619 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
620 {
621         struct net_device *dev = rt->dst.dev;
622         if (!oif || dev->ifindex == oif)
623                 return 2;
624         if ((dev->flags & IFF_LOOPBACK) &&
625             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
626                 return 1;
627         return 0;
628 }
629
/* Classify next-hop reachability for route scoring.  Routes without a
 * gateway (or flagged RTF_NONEXTHOP) always succeed.  With a neighbour
 * entry, NUD_VALID succeeds; with router preference enabled, any state
 * short of NUD_FAILED also succeeds while NUD_FAILED maps to
 * RT6_NUD_FAIL_PROBE.  Without an entry, succeed under router
 * preference, otherwise request round-robin.
 */
static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
{
	struct neighbour *neigh;
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;

	if (rt->rt6i_flags & RTF_NONEXTHOP ||
	    !(rt->rt6i_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
660
/* Compute a comparable score for @rt: the interface match occupies the
 * low two bits and, when router preference is configured, the decoded
 * preference sits above them.  Returns a negative rt6_nud_state value
 * instead when the route must not be used.
 */
static int rt6_score_route(struct rt6_info *rt, int oif,
			   int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		/* Negative results (unreachable/do-rr) propagate as-is. */
		if (n < 0)
			return n;
	}
	return m;
}
679
/* Score @rt and return whichever of @rt / @match scores higher, keeping
 * *mpri (best score so far) and *do_rr (round-robin requested) up to
 * date.  Expired routes — and, when the sysctl asks for it, routes on
 * carrier-less links — are skipped outright.
 */
static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
				   int *mpri, struct rt6_info *match,
				   bool *do_rr)
{
	int m;
	bool match_do_rr = false;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *dev = rt->dst.dev;

	if (dev && !netif_carrier_ok(dev) &&
	    idev->cnf.ignore_routes_with_linkdown &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (rt6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}
717
/* Find the best route at metric @metric, scanning in round-robin order:
 * from @rr_head to the end of the chain, then wrapping from @leaf back
 * up to @rr_head.  If nothing at this metric matched, continue from the
 * first route with a different metric (@cont) to the end of the chain.
 * Caller holds rcu_read_lock().
 */
static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
				     struct rt6_info *leaf,
				     struct rt6_info *rr_head,
				     u32 metric, int oif, int strict,
				     bool *do_rr)
{
	struct rt6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rcu_dereference(rt->dst.rt6_next)) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	/* Wrap around: cover the routes before @rr_head. */
	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->dst.rt6_next)) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	/* Nothing at the preferred metric: try the remaining routes. */
	for (rt = cont; rt; rt = rcu_dereference(rt->dst.rt6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
756
/* Select the best route under fib6_node @fn for @oif, honouring the
 * round-robin pointer fn->rr_ptr and advancing it (under tb6_lock)
 * when find_rr_leaf() requested round-robin.  Caller holds
 * rcu_read_lock().  Returns ip6_null_entry when no usable route exists.
 */
static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
				   int oif, int strict)
{
	struct rt6_info *leaf = rcu_dereference(fn->leaf);
	struct rt6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf)
		return net->ipv6.ip6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not points to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->rt6i_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->rt6i_src.plen)
		key_plen = rt0->rt6i_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.ip6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct rt6_info *next = rcu_dereference(rt0->dst.rt6_next);

		/* no entries matched; do round-robin */
		if (!next || next->rt6i_metric != rt0->rt6i_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->rt6i_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->rt6i_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.ip6_null_entry;
}
806
/* True when @rt either has a gateway or is flagged as needing no next
 * hop resolution.
 */
static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
{
	return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
}
811
812 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Process one Route Information option (RFC 4191) received with a
 * Router Advertisement from @gwaddr: validate it, then add, refresh or
 * withdraw the corresponding RTF_ROUTEINFO route.  Returns 0 on
 * success, -EINVAL on a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		/* prefixes longer than 64 bits need the full prefix field
		 * (option length of at least 2 units)
		 */
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	/* A zero prefix length advertises a default route. */
	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	/* A zero lifetime withdraws an existing route. */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		/* Refresh the preference on the existing route. */
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			rt6_clean_expires(rt);
		else
			rt6_set_expires(rt, jiffies + HZ * lifetime);

		ip6_rt_put(rt);
	}
	return 0;
}
886 #endif
887
/* Climb back up the fib6 trie from @fn until a node carrying route
 * information (RTN_RTINFO) is found, descending into a parent's
 * source-address subtree on the way up when one exists.
 * Returns NULL once the tree root is reached without a match.
 * Uses rcu_dereference(), so the caller must be in an RCU read section.
 */
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		/* sn == fn means we just came out of that subtree;
		 * otherwise re-enter it keyed on the source address
		 */
		if (sn && sn != fn)
			fn = fib6_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}
905
906 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
907                           bool null_fallback)
908 {
909         struct rt6_info *rt = *prt;
910
911         if (dst_hold_safe(&rt->dst))
912                 return true;
913         if (null_fallback) {
914                 rt = net->ipv6.ip6_null_entry;
915                 dst_hold(&rt->dst);
916         } else {
917                 rt = NULL;
918         }
919         *prt = rt;
920         return false;
921 }
922
/* Route lookup within a single fib6 table: walk the trie for
 * (daddr, saddr), narrow the leaf by device/multipath, backtrack up
 * the trie on a miss, and finally prefer a matching cached exception
 * route.  Returns a held dst; ip6_null_entry when nothing matches.
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6, int flags)
{
	struct rt6_info *rt, *rt_cache;
	struct fib6_node *fn;

	rcu_read_lock();
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	rt = rcu_dereference(fn->leaf);
	if (!rt) {
		rt = net->ipv6.ip6_null_entry;
	} else {
		rt = rt6_device_match(net, rt, &fl6->saddr,
				      fl6->flowi6_oif, flags);
		/* spread flows over siblings only when no oif is forced */
		if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
			rt = rt6_multipath_select(rt, fl6,
						  fl6->flowi6_oif, flags);
	}
	if (rt == net->ipv6.ip6_null_entry) {
		/* nothing usable at this node; climb towards the root */
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}
	/* Search through exception table */
	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
	if (rt_cache)
		rt = rt_cache;

	/* hold the result; falls back to a held ip6_null_entry on failure */
	if (ip6_hold_safe(net, &rt, true))
		dst_use_noref(&rt->dst, jiffies);

	rcu_read_unlock();

	trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);

	return rt;

}
963
/* Table-agnostic lookup entry point: dispatches through
 * fib6_rule_lookup() so the policy-routing rules choose which table
 * ip6_pol_route_lookup() runs against.
 */
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				    int flags)
{
	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
970
971 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
972                             const struct in6_addr *saddr, int oif, int strict)
973 {
974         struct flowi6 fl6 = {
975                 .flowi6_oif = oif,
976                 .daddr = *daddr,
977         };
978         struct dst_entry *dst;
979         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
980
981         if (saddr) {
982                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
983                 flags |= RT6_LOOKUP_F_HAS_SADDR;
984         }
985
986         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
987         if (dst->error == 0)
988                 return (struct rt6_info *) dst;
989
990         dst_release(dst);
991
992         return NULL;
993 }
994 EXPORT_SYMBOL(rt6_lookup);
995
996 /* ip6_ins_rt is called with FREE table->tb6_lock.
997  * It takes new route entry, the addition fails by any reason the
998  * route is released.
999  * Caller must hold dst before calling it.
1000  */
1001
1002 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
1003                         struct mx6_config *mxc,
1004                         struct netlink_ext_ack *extack)
1005 {
1006         int err;
1007         struct fib6_table *table;
1008
1009         table = rt->rt6i_table;
1010         spin_lock_bh(&table->tb6_lock);
1011         err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
1012         spin_unlock_bh(&table->tb6_lock);
1013
1014         return err;
1015 }
1016
1017 int ip6_ins_rt(struct rt6_info *rt)
1018 {
1019         struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
1020         struct mx6_config mxc = { .mx = NULL, };
1021
1022         /* Hold dst to account for the reference from the fib6 tree */
1023         dst_hold(&rt->dst);
1024         return __ip6_ins_rt(rt, &info, &mxc, NULL);
1025 }
1026
1027 /* called with rcu_lock held */
1028 static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
1029 {
1030         struct net_device *dev = rt->dst.dev;
1031
1032         if (rt->rt6i_flags & RTF_LOCAL) {
1033                 /* for copies of local routes, dst->dev needs to be the
1034                  * device if it is a master device, the master device if
1035                  * device is enslaved, and the loopback as the default
1036                  */
1037                 if (netif_is_l3_slave(dev) &&
1038                     !rt6_need_strict(&rt->rt6i_dst.addr))
1039                         dev = l3mdev_master_dev_rcu(dev);
1040                 else if (!netif_is_l3_master(dev))
1041                         dev = dev_net(dev)->loopback_dev;
1042                 /* last case is netif_is_l3_master(dev) is true in which
1043                  * case we want dev returned to be dev
1044                  */
1045         }
1046
1047         return dev;
1048 }
1049
/* Allocate an RTF_CACHE clone of @ort as a /128 host route towards
 * @daddr (and, under CONFIG_IPV6_SUBTREES, @saddr).  The returned
 * route carries the reference taken by __ip6_dst_alloc(); returns
 * NULL on allocation failure.
 */
static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	/* cache/pcpu clones reference their parent via dst.from;
	 * always clone from the original fib6 entry
	 */
	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = (struct rt6_info *)ort->dst.from;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(ort);
	rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
	rcu_read_unlock();
	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->rt6i_metric = 0;
	rt->dst.flags |= DST_HOST;
	/* the clone is a host route for exactly this destination */
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		/* cloning a non-/128 route to its own prefix address
		 * yields an anycast destination
		 */
		if (ort->rt6i_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
1092
1093 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
1094 {
1095         struct net_device *dev;
1096         struct rt6_info *pcpu_rt;
1097
1098         rcu_read_lock();
1099         dev = ip6_rt_get_dev_rcu(rt);
1100         pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
1101         rcu_read_unlock();
1102         if (!pcpu_rt)
1103                 return NULL;
1104         ip6_rt_copy_init(pcpu_rt, rt);
1105         pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1106         pcpu_rt->rt6i_flags |= RTF_PCPU;
1107         return pcpu_rt;
1108 }
1109
1110 /* It should be called with rcu_read_lock() acquired */
1111 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1112 {
1113         struct rt6_info *pcpu_rt, **p;
1114
1115         p = this_cpu_ptr(rt->rt6i_pcpu);
1116         pcpu_rt = *p;
1117
1118         if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
1119                 rt6_dst_from_metrics_check(pcpu_rt);
1120
1121         return pcpu_rt;
1122 }
1123
/* Create and publish this CPU's pcpu clone of @rt.  If a concurrent
 * context wins the cmpxchg race, its clone is returned (held) and ours
 * is dropped.  On allocation failure a held ip6_null_entry is returned.
 */
static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		struct net *net = dev_net(rt->dst.dev);

		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	/* second reference: one for the pcpu slot, one for the caller */
	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	if (prev) {
		/* If someone did it before us, return prev instead */
		/* release refcnt taken by ip6_rt_pcpu_alloc() */
		dst_release_immediate(&pcpu_rt->dst);
		/* release refcnt taken by above dst_hold() */
		dst_release_immediate(&pcpu_rt->dst);
		dst_hold(&prev->dst);
		pcpu_rt = prev;
	}

	rt6_dst_from_metrics_check(pcpu_rt);
	return pcpu_rt;
}
1152
1153 /* exception hash table implementation
1154  */
1155 static DEFINE_SPINLOCK(rt6_exception_lock);
1156
1157 /* Remove rt6_ex from hash table and free the memory
1158  * Caller must hold rt6_exception_lock
1159  */
1160 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1161                                  struct rt6_exception *rt6_ex)
1162 {
1163         struct net *net = dev_net(rt6_ex->rt6i->dst.dev);
1164
1165         if (!bucket || !rt6_ex)
1166                 return;
1167         rt6_ex->rt6i->rt6i_node = NULL;
1168         hlist_del_rcu(&rt6_ex->hlist);
1169         rt6_release(rt6_ex->rt6i);
1170         kfree_rcu(rt6_ex, rcu);
1171         WARN_ON_ONCE(!bucket->depth);
1172         bucket->depth--;
1173         net->ipv6.rt6_stats->fib_rt_cache--;
1174 }
1175
1176 /* Remove oldest rt6_ex in bucket and free the memory
1177  * Caller must hold rt6_exception_lock
1178  */
1179 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1180 {
1181         struct rt6_exception *rt6_ex, *oldest = NULL;
1182
1183         if (!bucket)
1184                 return;
1185
1186         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1187                 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1188                         oldest = rt6_ex;
1189         }
1190         rt6_remove_exception(bucket, oldest);
1191 }
1192
1193 static u32 rt6_exception_hash(const struct in6_addr *dst,
1194                               const struct in6_addr *src)
1195 {
1196         static u32 seed __read_mostly;
1197         u32 val;
1198
1199         net_get_random_once(&seed, sizeof(seed));
1200         val = jhash(dst, sizeof(*dst), seed);
1201
1202 #ifdef CONFIG_IPV6_SUBTREES
1203         if (src)
1204                 val = jhash(src, sizeof(*src), val);
1205 #endif
1206         return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1207 }
1208
1209 /* Helper function to find the cached rt in the hash table
1210  * and update bucket pointer to point to the bucket for this
1211  * (daddr, saddr) pair
1212  * Caller must hold rt6_exception_lock
1213  */
1214 static struct rt6_exception *
1215 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1216                               const struct in6_addr *daddr,
1217                               const struct in6_addr *saddr)
1218 {
1219         struct rt6_exception *rt6_ex;
1220         u32 hval;
1221
1222         if (!(*bucket) || !daddr)
1223                 return NULL;
1224
1225         hval = rt6_exception_hash(daddr, saddr);
1226         *bucket += hval;
1227
1228         hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1229                 struct rt6_info *rt6 = rt6_ex->rt6i;
1230                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1231
1232 #ifdef CONFIG_IPV6_SUBTREES
1233                 if (matched && saddr)
1234                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1235 #endif
1236                 if (matched)
1237                         return rt6_ex;
1238         }
1239         return NULL;
1240 }
1241
1242 /* Helper function to find the cached rt in the hash table
1243  * and update bucket pointer to point to the bucket for this
1244  * (daddr, saddr) pair
1245  * Caller must hold rcu_read_lock()
1246  */
1247 static struct rt6_exception *
1248 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1249                          const struct in6_addr *daddr,
1250                          const struct in6_addr *saddr)
1251 {
1252         struct rt6_exception *rt6_ex;
1253         u32 hval;
1254
1255         WARN_ON_ONCE(!rcu_read_lock_held());
1256
1257         if (!(*bucket) || !daddr)
1258                 return NULL;
1259
1260         hval = rt6_exception_hash(daddr, saddr);
1261         *bucket += hval;
1262
1263         hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1264                 struct rt6_info *rt6 = rt6_ex->rt6i;
1265                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1266
1267 #ifdef CONFIG_IPV6_SUBTREES
1268                 if (matched && saddr)
1269                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1270 #endif
1271                 if (matched)
1272                         return rt6_ex;
1273         }
1274         return NULL;
1275 }
1276
/* Insert cached route @nrt into the exception table hanging off its
 * parent fib6 entry @ort, replacing any existing entry for the same
 * (dst, src) key.  Takes rt6_exception_lock.
 * Returns 0 on success, -EINVAL when insertion is not allowed
 * (flushed table, or nrt's mtu not below ort's), or -ENOMEM.
 */
static int rt6_insert_exception(struct rt6_info *nrt,
				struct rt6_info *ort)
{
	struct net *net = dev_net(ort->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	/* ort can't be a cache or pcpu route */
	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = (struct rt6_info *)ort->dst.from;
	WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));

	spin_lock_bh(&rt6_exception_lock);

	/* ort is being torn down; don't recreate its bucket list */
	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		/* first exception for ort: allocate and publish the table */
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->rt6i_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
		err = -EINVAL;
		goto out;
	}

	/* an entry for the same key is replaced, not duplicated */
	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	atomic_inc(&nrt->rt6i_ref);
	nrt->rt6i_node = ort->rt6i_node;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	/* bound the chain length by evicting the oldest entry */
	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err)
		fib6_update_sernum(ort);

	return err;
}
1364
1365 void rt6_flush_exceptions(struct rt6_info *rt)
1366 {
1367         struct rt6_exception_bucket *bucket;
1368         struct rt6_exception *rt6_ex;
1369         struct hlist_node *tmp;
1370         int i;
1371
1372         spin_lock_bh(&rt6_exception_lock);
1373         /* Prevent rt6_insert_exception() to recreate the bucket list */
1374         rt->exception_bucket_flushed = 1;
1375
1376         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1377                                     lockdep_is_held(&rt6_exception_lock));
1378         if (!bucket)
1379                 goto out;
1380
1381         for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1382                 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1383                         rt6_remove_exception(bucket, rt6_ex);
1384                 WARN_ON_ONCE(bucket->depth);
1385                 bucket++;
1386         }
1387
1388 out:
1389         spin_unlock_bh(&rt6_exception_lock);
1390 }
1391
1392 /* Find cached rt in the hash table inside passed in rt
1393  * Caller has to hold rcu_read_lock()
1394  */
1395 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
1396                                            struct in6_addr *daddr,
1397                                            struct in6_addr *saddr)
1398 {
1399         struct rt6_exception_bucket *bucket;
1400         struct in6_addr *src_key = NULL;
1401         struct rt6_exception *rt6_ex;
1402         struct rt6_info *res = NULL;
1403
1404         bucket = rcu_dereference(rt->rt6i_exception_bucket);
1405
1406 #ifdef CONFIG_IPV6_SUBTREES
1407         /* rt6i_src.plen != 0 indicates rt is in subtree
1408          * and exception table is indexed by a hash of
1409          * both rt6i_dst and rt6i_src.
1410          * Otherwise, the exception table is indexed by
1411          * a hash of only rt6i_dst.
1412          */
1413         if (rt->rt6i_src.plen)
1414                 src_key = saddr;
1415 #endif
1416         rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1417
1418         if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1419                 res = rt6_ex->rt6i;
1420
1421         return res;
1422 }
1423
1424 /* Remove the passed in cached rt from the hash table that contains it */
1425 int rt6_remove_exception_rt(struct rt6_info *rt)
1426 {
1427         struct rt6_info *from = (struct rt6_info *)rt->dst.from;
1428         struct rt6_exception_bucket *bucket;
1429         struct in6_addr *src_key = NULL;
1430         struct rt6_exception *rt6_ex;
1431         int err;
1432
1433         if (!from ||
1434             !(rt->rt6i_flags | RTF_CACHE))
1435                 return -EINVAL;
1436
1437         if (!rcu_access_pointer(from->rt6i_exception_bucket))
1438                 return -ENOENT;
1439
1440         spin_lock_bh(&rt6_exception_lock);
1441         bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1442                                     lockdep_is_held(&rt6_exception_lock));
1443 #ifdef CONFIG_IPV6_SUBTREES
1444         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1445          * and exception table is indexed by a hash of
1446          * both rt6i_dst and rt6i_src.
1447          * Otherwise, the exception table is indexed by
1448          * a hash of only rt6i_dst.
1449          */
1450         if (from->rt6i_src.plen)
1451                 src_key = &rt->rt6i_src.addr;
1452 #endif
1453         rt6_ex = __rt6_find_exception_spinlock(&bucket,
1454                                                &rt->rt6i_dst.addr,
1455                                                src_key);
1456         if (rt6_ex) {
1457                 rt6_remove_exception(bucket, rt6_ex);
1458                 err = 0;
1459         } else {
1460                 err = -ENOENT;
1461         }
1462
1463         spin_unlock_bh(&rt6_exception_lock);
1464         return err;
1465 }
1466
1467 /* Find rt6_ex which contains the passed in rt cache and
1468  * refresh its stamp
1469  */
1470 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1471 {
1472         struct rt6_info *from = (struct rt6_info *)rt->dst.from;
1473         struct rt6_exception_bucket *bucket;
1474         struct in6_addr *src_key = NULL;
1475         struct rt6_exception *rt6_ex;
1476
1477         if (!from ||
1478             !(rt->rt6i_flags | RTF_CACHE))
1479                 return;
1480
1481         rcu_read_lock();
1482         bucket = rcu_dereference(from->rt6i_exception_bucket);
1483
1484 #ifdef CONFIG_IPV6_SUBTREES
1485         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1486          * and exception table is indexed by a hash of
1487          * both rt6i_dst and rt6i_src.
1488          * Otherwise, the exception table is indexed by
1489          * a hash of only rt6i_dst.
1490          */
1491         if (from->rt6i_src.plen)
1492                 src_key = &rt->rt6i_src.addr;
1493 #endif
1494         rt6_ex = __rt6_find_exception_rcu(&bucket,
1495                                           &rt->rt6i_dst.addr,
1496                                           src_key);
1497         if (rt6_ex)
1498                 rt6_ex->stamp = jiffies;
1499
1500         rcu_read_unlock();
1501 }
1502
1503 static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
1504 {
1505         struct rt6_exception_bucket *bucket;
1506         struct rt6_exception *rt6_ex;
1507         int i;
1508
1509         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1510                                         lockdep_is_held(&rt6_exception_lock));
1511
1512         if (bucket) {
1513                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1514                         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1515                                 rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
1516                         }
1517                         bucket++;
1518                 }
1519         }
1520 }
1521
1522 static void rt6_exceptions_update_pmtu(struct rt6_info *rt, int mtu)
1523 {
1524         struct rt6_exception_bucket *bucket;
1525         struct rt6_exception *rt6_ex;
1526         int i;
1527
1528         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1529                                         lockdep_is_held(&rt6_exception_lock));
1530
1531         if (bucket) {
1532                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1533                         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1534                                 struct rt6_info *entry = rt6_ex->rt6i;
1535                                 /* For RTF_CACHE with rt6i_pmtu == 0
1536                                  * (i.e. a redirected route),
1537                                  * the metrics of its rt->dst.from has already
1538                                  * been updated.
1539                                  */
1540                                 if (entry->rt6i_pmtu && entry->rt6i_pmtu > mtu)
1541                                         entry->rt6i_pmtu = mtu;
1542                         }
1543                         bucket++;
1544                 }
1545         }
1546 }
1547
1548 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
1549
1550 static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
1551                                         struct in6_addr *gateway)
1552 {
1553         struct rt6_exception_bucket *bucket;
1554         struct rt6_exception *rt6_ex;
1555         struct hlist_node *tmp;
1556         int i;
1557
1558         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1559                 return;
1560
1561         spin_lock_bh(&rt6_exception_lock);
1562         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1563                                      lockdep_is_held(&rt6_exception_lock));
1564
1565         if (bucket) {
1566                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1567                         hlist_for_each_entry_safe(rt6_ex, tmp,
1568                                                   &bucket->chain, hlist) {
1569                                 struct rt6_info *entry = rt6_ex->rt6i;
1570
1571                                 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1572                                     RTF_CACHE_GATEWAY &&
1573                                     ipv6_addr_equal(gateway,
1574                                                     &entry->rt6i_gateway)) {
1575                                         rt6_remove_exception(bucket, rt6_ex);
1576                                 }
1577                         }
1578                         bucket++;
1579                 }
1580         }
1581
1582         spin_unlock_bh(&rt6_exception_lock);
1583 }
1584
1585 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1586                                       struct rt6_exception *rt6_ex,
1587                                       struct fib6_gc_args *gc_args,
1588                                       unsigned long now)
1589 {
1590         struct rt6_info *rt = rt6_ex->rt6i;
1591
1592         if (atomic_read(&rt->dst.__refcnt) == 1 &&
1593             time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1594                 RT6_TRACE("aging clone %p\n", rt);
1595                 rt6_remove_exception(bucket, rt6_ex);
1596                 return;
1597         } else if (rt->rt6i_flags & RTF_GATEWAY) {
1598                 struct neighbour *neigh;
1599                 __u8 neigh_flags = 0;
1600
1601                 neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway);
1602                 if (neigh) {
1603                         neigh_flags = neigh->flags;
1604                         neigh_release(neigh);
1605                 }
1606                 if (!(neigh_flags & NTF_ROUTER)) {
1607                         RT6_TRACE("purging route %p via non-router but gateway\n",
1608                                   rt);
1609                         rt6_remove_exception(bucket, rt6_ex);
1610                         return;
1611                 }
1612         }
1613         gc_args->more++;
1614 }
1615
1616 void rt6_age_exceptions(struct rt6_info *rt,
1617                         struct fib6_gc_args *gc_args,
1618                         unsigned long now)
1619 {
1620         struct rt6_exception_bucket *bucket;
1621         struct rt6_exception *rt6_ex;
1622         struct hlist_node *tmp;
1623         int i;
1624
1625         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1626                 return;
1627
1628         spin_lock_bh(&rt6_exception_lock);
1629         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1630                                     lockdep_is_held(&rt6_exception_lock));
1631
1632         if (bucket) {
1633                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1634                         hlist_for_each_entry_safe(rt6_ex, tmp,
1635                                                   &bucket->chain, hlist) {
1636                                 rt6_age_examine_exception(bucket, rt6_ex,
1637                                                           gc_args, now);
1638                         }
1639                         bucket++;
1640                 }
1641         }
1642         spin_unlock_bh(&rt6_exception_lock);
1643 }
1644
/* Core policy-routing lookup: walk the fib6 tree for @table under RCU,
 * select a route (backtracking toward less-specific prefixes, and retrying
 * once without RT6_LOOKUP_F_REACHABLE if the strict pass found nothing),
 * then prefer a matching entry from the exception cache.  Always returns a
 * route with a reference held; on total failure that route is
 * net->ipv6.ip6_null_entry.
 */
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6, int flags)
{
	struct fib6_node *fn, *saved_fn;
	struct rt6_info *rt, *rt_cache;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	/* Acting as a host (no forwarding): prefer (probably) reachable
	 * routers first; the REACHABLE requirement is dropped below if it
	 * leaves us with nothing.
	 */
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;	/* restart point for the relaxed (non-REACHABLE) retry */

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	rt = rt6_select(net, fn, oif, strict);
	if (rt->rt6i_nsiblings)
		rt = rt6_multipath_select(rt, fl6, oif, strict);
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	/*Search through exception table */
	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
	if (rt_cache)
		rt = rt_cache;

	if (rt == net->ipv6.ip6_null_entry) {
		rcu_read_unlock();
		dst_hold(&rt->dst);	/* null entry: plain hold is always safe */
		trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
		return rt;
	} else if (rt->rt6i_flags & RTF_CACHE) {
		/* Cached clone: take a noref hold if the entry is still
		 * live; ip6_hold_safe() substitutes a held null entry
		 * otherwise (see the 'true' argument).
		 */
		if (ip6_hold_safe(net, &rt, true)) {
			dst_use_noref(&rt->dst, jiffies);
			rt6_dst_from_metrics_check(rt);
		}
		rcu_read_unlock();
		trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(rt->rt6i_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */

		struct rt6_info *uncached_rt;

		if (ip6_hold_safe(net, &rt, true)) {
			dst_use_noref(&rt->dst, jiffies);
		} else {
			/* rt died under us; ip6_hold_safe() already swapped
			 * in a held null entry - return it directly.
			 */
			rcu_read_unlock();
			uncached_rt = rt;
			goto uncached_rt_out;
		}
		rcu_read_unlock();

		uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
		dst_release(&rt->dst);	/* drop the temporary hold taken above */

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

uncached_rt_out:
		trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
		return uncached_rt;

	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		dst_use_noref(&rt->dst, jiffies);
		pcpu_rt = rt6_get_pcpu_route(rt);

		if (pcpu_rt) {
			rcu_read_unlock();
		} else {
			/* atomic_inc_not_zero() is needed when using rcu */
			if (atomic_inc_not_zero(&rt->rt6i_ref)) {
				/* We have to do the read_unlock first
				 * because rt6_make_pcpu_route() may trigger
				 * ip6_dst_gc() which will take the write_lock.
				 *
				 * No dst_hold() on rt is needed because grabbing
				 * rt->rt6i_ref makes sure rt can't be released.
				 */
				rcu_read_unlock();
				pcpu_rt = rt6_make_pcpu_route(rt);
				rt6_release(rt);
			} else {
				/* rt is already removed from tree */
				rcu_read_unlock();
				pcpu_rt = net->ipv6.ip6_null_entry;
				dst_hold(&pcpu_rt->dst);
			}
		}

		trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
		return pcpu_rt;
	}
}
1771 EXPORT_SYMBOL_GPL(ip6_pol_route);
1772
/* Input-path adapter for fib6_rule_lookup(): route using the flow's
 * incoming interface as the oif hint.
 */
static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
					    struct flowi6 *fl6, int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
}
1778
1779 struct dst_entry *ip6_route_input_lookup(struct net *net,
1780                                          struct net_device *dev,
1781                                          struct flowi6 *fl6, int flags)
1782 {
1783         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1784                 flags |= RT6_LOOKUP_F_IFACE;
1785
1786         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1787 }
1788 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1789
1790 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1791                                   struct flow_keys *keys)
1792 {
1793         const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1794         const struct ipv6hdr *key_iph = outer_iph;
1795         const struct ipv6hdr *inner_iph;
1796         const struct icmp6hdr *icmph;
1797         struct ipv6hdr _inner_iph;
1798
1799         if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1800                 goto out;
1801
1802         icmph = icmp6_hdr(skb);
1803         if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1804             icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1805             icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1806             icmph->icmp6_type != ICMPV6_PARAMPROB)
1807                 goto out;
1808
1809         inner_iph = skb_header_pointer(skb,
1810                                        skb_transport_offset(skb) + sizeof(*icmph),
1811                                        sizeof(_inner_iph), &_inner_iph);
1812         if (!inner_iph)
1813                 goto out;
1814
1815         key_iph = inner_iph;
1816 out:
1817         memset(keys, 0, sizeof(*keys));
1818         keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1819         keys->addrs.v6addrs.src = key_iph->saddr;
1820         keys->addrs.v6addrs.dst = key_iph->daddr;
1821         keys->tags.flow_label = ip6_flowinfo(key_iph);
1822         keys->basic.ip_proto = key_iph->nexthdr;
1823 }
1824
1825 /* if skb is set it will be used and fl6 can be NULL */
1826 u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
1827 {
1828         struct flow_keys hash_keys;
1829
1830         if (skb) {
1831                 ip6_multipath_l3_keys(skb, &hash_keys);
1832                 return flow_hash_from_keys(&hash_keys);
1833         }
1834
1835         return get_hash_from_flowi6(fl6);
1836 }
1837
1838 void ip6_route_input(struct sk_buff *skb)
1839 {
1840         const struct ipv6hdr *iph = ipv6_hdr(skb);
1841         struct net *net = dev_net(skb->dev);
1842         int flags = RT6_LOOKUP_F_HAS_SADDR;
1843         struct ip_tunnel_info *tun_info;
1844         struct flowi6 fl6 = {
1845                 .flowi6_iif = skb->dev->ifindex,
1846                 .daddr = iph->daddr,
1847                 .saddr = iph->saddr,
1848                 .flowlabel = ip6_flowinfo(iph),
1849                 .flowi6_mark = skb->mark,
1850                 .flowi6_proto = iph->nexthdr,
1851         };
1852
1853         tun_info = skb_tunnel_info(skb);
1854         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1855                 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1856         if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
1857                 fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
1858         skb_dst_drop(skb);
1859         skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1860 }
1861
/* Output-path adapter for fib6_rule_lookup(): route using the flow's
 * egress interface as the oif hint.
 */
static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
					     struct flowi6 *fl6, int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
}
1867
1868 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1869                                          struct flowi6 *fl6, int flags)
1870 {
1871         bool any_src;
1872
1873         if (rt6_need_strict(&fl6->daddr)) {
1874                 struct dst_entry *dst;
1875
1876                 dst = l3mdev_link_scope_lookup(net, fl6);
1877                 if (dst)
1878                         return dst;
1879         }
1880
1881         fl6->flowi6_iif = LOOPBACK_IFINDEX;
1882
1883         any_src = ipv6_addr_any(&fl6->saddr);
1884         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1885             (fl6->flowi6_oif && any_src))
1886                 flags |= RT6_LOOKUP_F_IFACE;
1887
1888         if (!any_src)
1889                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1890         else if (sk)
1891                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1892
1893         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1894 }
1895 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1896
1897 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1898 {
1899         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1900         struct net_device *loopback_dev = net->loopback_dev;
1901         struct dst_entry *new = NULL;
1902
1903         rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
1904                        DST_OBSOLETE_NONE, 0);
1905         if (rt) {
1906                 rt6_info_init(rt);
1907                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
1908
1909                 new = &rt->dst;
1910                 new->__use = 1;
1911                 new->input = dst_discard;
1912                 new->output = dst_discard_out;
1913
1914                 dst_copy_metrics(new, &ort->dst);
1915
1916                 rt->rt6i_idev = in6_dev_get(loopback_dev);
1917                 rt->rt6i_gateway = ort->rt6i_gateway;
1918                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
1919                 rt->rt6i_metric = 0;
1920
1921                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1922 #ifdef CONFIG_IPV6_SUBTREES
1923                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1924 #endif
1925         }
1926
1927         dst_release(dst_orig);
1928         return new ? new : ERR_PTR(-ENOMEM);
1929 }
1930
1931 /*
1932  *      Destination cache support functions
1933  */
1934
1935 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1936 {
1937         if (rt->dst.from &&
1938             dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1939                 dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
1940 }
1941
1942 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1943 {
1944         u32 rt_cookie = 0;
1945
1946         if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
1947                 return NULL;
1948
1949         if (rt6_check_expired(rt))
1950                 return NULL;
1951
1952         return &rt->dst;
1953 }
1954
1955 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1956 {
1957         if (!__rt6_check_expired(rt) &&
1958             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1959             rt6_check((struct rt6_info *)(rt->dst.from), cookie))
1960                 return &rt->dst;
1961         else
1962                 return NULL;
1963 }
1964
1965 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1966 {
1967         struct rt6_info *rt;
1968
1969         rt = (struct rt6_info *) dst;
1970
1971         /* All IPV6 dsts are created with ->obsolete set to the value
1972          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1973          * into this function always.
1974          */
1975
1976         rt6_dst_from_metrics_check(rt);
1977
1978         if (rt->rt6i_flags & RTF_PCPU ||
1979             (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from))
1980                 return rt6_dst_from_check(rt, cookie);
1981         else
1982                 return rt6_check(rt, cookie);
1983 }
1984
1985 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1986 {
1987         struct rt6_info *rt = (struct rt6_info *) dst;
1988
1989         if (rt) {
1990                 if (rt->rt6i_flags & RTF_CACHE) {
1991                         if (rt6_check_expired(rt)) {
1992                                 ip6_del_rt(rt);
1993                                 dst = NULL;
1994                         }
1995                 } else {
1996                         dst_release(dst);
1997                         dst = NULL;
1998                 }
1999         }
2000         return dst;
2001 }
2002
2003 static void ip6_link_failure(struct sk_buff *skb)
2004 {
2005         struct rt6_info *rt;
2006
2007         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2008
2009         rt = (struct rt6_info *) skb_dst(skb);
2010         if (rt) {
2011                 if (rt->rt6i_flags & RTF_CACHE) {
2012                         if (dst_hold_safe(&rt->dst))
2013                                 ip6_del_rt(rt);
2014                 } else {
2015                         struct fib6_node *fn;
2016
2017                         rcu_read_lock();
2018                         fn = rcu_dereference(rt->rt6i_node);
2019                         if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2020                                 fn->fn_sernum = -1;
2021                         rcu_read_unlock();
2022                 }
2023         }
2024 }
2025
2026 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2027 {
2028         struct net *net = dev_net(rt->dst.dev);
2029
2030         rt->rt6i_flags |= RTF_MODIFIED;
2031         rt->rt6i_pmtu = mtu;
2032         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2033 }
2034
2035 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2036 {
2037         return !(rt->rt6i_flags & RTF_CACHE) &&
2038                 (rt->rt6i_flags & RTF_PCPU ||
2039                  rcu_access_pointer(rt->rt6i_node));
2040 }
2041
/* Core PMTU update for @dst.  Addresses come from @iph when given, else
 * from @sk, else none (then no exception clone can be keyed).  Local
 * routes, locked-MTU dsts and non-shrinking updates are ignored.
 */
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	if (rt6->rt6i_flags & RTF_LOCAL)
		return;

	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}
	dst_confirm_neigh(dst, daddr);
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);	/* never go below the IPv6 minimum */
	if (mtu >= dst_mtu(dst))
		return;	/* PMTU only ever shrinks here */

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		/* Entry is itself a clone (or detached): update in place. */
		rt6_do_update_pmtu(rt6, mtu);
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
	} else if (daddr) {
		/* Shared tree/pcpu route: record the MTU on a fresh
		 * RTF_CACHE clone inserted into the exception table
		 * instead of modifying the shared entry.
		 */
		struct rt6_info *nrt6;

		nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);
			if (rt6_insert_exception(nrt6, rt6))
				dst_release_immediate(&nrt6->dst);	/* insert failed; drop clone */
		}
	}
}
2085
/* dst_ops->update_pmtu hook: take the header from @skb when one is
 * available, otherwise fall back to the socket's addresses.
 */
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu)
{
	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
}
2091
2092 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2093                      int oif, u32 mark, kuid_t uid)
2094 {
2095         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2096         struct dst_entry *dst;
2097         struct flowi6 fl6;
2098
2099         memset(&fl6, 0, sizeof(fl6));
2100         fl6.flowi6_oif = oif;
2101         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2102         fl6.daddr = iph->daddr;
2103         fl6.saddr = iph->saddr;
2104         fl6.flowlabel = ip6_flowinfo(iph);
2105         fl6.flowi6_uid = uid;
2106
2107         dst = ip6_route_output(net, NULL, &fl6);
2108         if (!dst->error)
2109                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2110         dst_release(dst);
2111 }
2112 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2113
/* Socket-context PMTU update: apply the new MTU for @sk's flow, then, if
 * the socket's cached dst no longer validates, refresh it via
 * ip6_datagram_dst_update() (skipped for v4-mapped peers and when the
 * socket is owned by user context).
 */
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	struct dst_entry *dst;

	ip6_update_pmtu(skb, sock_net(sk), mtu,
			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);

	/* Nothing more to do if the cached dst is absent or still valid. */
	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
2131 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2132
/* Handle redirects */
struct ip6rd_flowi {
	struct flowi6 fl6;		/* must stay first: __ip6_route_redirect() casts flowi6* back */
	struct in6_addr gateway;	/* router that sent the redirect */
};
2138
/* Lookup used to validate an incoming redirect: find the route currently
 * used toward fl6->daddr whose gateway (directly, or via a cached
 * exception) matches the redirecting router in rdfl->gateway.  Returns a
 * held route; ip6_null_entry when no acceptable route exists.
 */
static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *rt, *rt_cache;
	struct fib6_node *fn;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	rcu_read_lock();
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for_each_fib6_node_rt_rcu(fn) {
		if (rt6_check_expired(rt))
			continue;
		if (rt->dst.error)
			break;
		if (!(rt->rt6i_flags & RTF_GATEWAY))
			continue;	/* on-link routes have no gateway to match */
		if (fl6->flowi6_oif != rt->dst.dev->ifindex)
			continue;	/* redirect must arrive on the route's device */
		/* rt_cache's gateway might be different from its 'parent'
		 * in the case of an ip redirect.
		 * So we keep searching in the exception table if the gateway
		 * is different.
		 */
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
			rt_cache = rt6_find_cached_rt(rt,
						      &fl6->daddr,
						      &fl6->saddr);
			if (rt_cache &&
			    ipv6_addr_equal(&rdfl->gateway,
					    &rt_cache->rt6i_gateway)) {
				rt = rt_cache;
				break;
			}
			continue;
		}
		break;
	}

	if (!rt)
		rt = net->ipv6.ip6_null_entry;
	else if (rt->dst.error) {
		rt = net->ipv6.ip6_null_entry;
		goto out;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		/* No candidate at this node: climb to a less-specific
		 * prefix and retry.
		 */
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

out:
	ip6_hold_safe(net, &rt, true);

	rcu_read_unlock();

	trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
	return rt;
};
2211
2212 static struct dst_entry *ip6_route_redirect(struct net *net,
2213                                         const struct flowi6 *fl6,
2214                                         const struct in6_addr *gateway)
2215 {
2216         int flags = RT6_LOOKUP_F_HAS_SADDR;
2217         struct ip6rd_flowi rdfl;
2218
2219         rdfl.fl6 = *fl6;
2220         rdfl.gateway = *gateway;
2221
2222         return fib6_rule_lookup(net, &rdfl.fl6,
2223                                 flags, __ip6_route_redirect);
2224 }
2225
2226 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2227                   kuid_t uid)
2228 {
2229         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2230         struct dst_entry *dst;
2231         struct flowi6 fl6;
2232
2233         memset(&fl6, 0, sizeof(fl6));
2234         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2235         fl6.flowi6_oif = oif;
2236         fl6.flowi6_mark = mark;
2237         fl6.daddr = iph->daddr;
2238         fl6.saddr = iph->saddr;
2239         fl6.flowlabel = ip6_flowinfo(iph);
2240         fl6.flowi6_uid = uid;
2241
2242         dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
2243         rt6_do_redirect(dst, NULL, skb);
2244         dst_release(dst);
2245 }
2246 EXPORT_SYMBOL_GPL(ip6_redirect);
2247
2248 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2249                             u32 mark)
2250 {
2251         const struct ipv6hdr *iph = ipv6_hdr(skb);
2252         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2253         struct dst_entry *dst;
2254         struct flowi6 fl6;
2255
2256         memset(&fl6, 0, sizeof(fl6));
2257         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2258         fl6.flowi6_oif = oif;
2259         fl6.flowi6_mark = mark;
2260         fl6.daddr = msg->dest;
2261         fl6.saddr = iph->daddr;
2262         fl6.flowi6_uid = sock_net_uid(net, NULL);
2263
2264         dst = ip6_route_redirect(net, &fl6, &iph->saddr);
2265         rt6_do_redirect(dst, NULL, skb);
2266         dst_release(dst);
2267 }
2268
/* Socket-context redirect: process a redirect for @sk's flow using the
 * socket's bound device, mark and uid.
 */
void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
		     sk->sk_uid);
}
2274 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2275
2276 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2277 {
2278         struct net_device *dev = dst->dev;
2279         unsigned int mtu = dst_mtu(dst);
2280         struct net *net = dev_net(dev);
2281
2282         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2283
2284         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2285                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2286
2287         /*
2288          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2289          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2290          * IPV6_MAXPLEN is also valid and means: "any MSS,
2291          * rely only on pmtu discovery"
2292          */
2293         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2294                 mtu = IPV6_MAXPLEN;
2295         return mtu;
2296 }
2297
2298 static unsigned int ip6_mtu(const struct dst_entry *dst)
2299 {
2300         const struct rt6_info *rt = (const struct rt6_info *)dst;
2301         unsigned int mtu = rt->rt6i_pmtu;
2302         struct inet6_dev *idev;
2303
2304         if (mtu)
2305                 goto out;
2306
2307         mtu = dst_metric_raw(dst, RTAX_MTU);
2308         if (mtu)
2309                 goto out;
2310
2311         mtu = IPV6_MIN_MTU;
2312
2313         rcu_read_lock();
2314         idev = __in6_dev_get(dst->dev);
2315         if (idev)
2316                 mtu = idev->cnf.mtu6;
2317         rcu_read_unlock();
2318
2319 out:
2320         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2321
2322         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2323 }
2324
/* Allocate a standalone dst for sending an ICMPv6 packet out @dev.  The
 * route is never linked into the fib tree; it goes on the uncached list
 * so device teardown can release it.  Returns an ERR_PTR on failure.
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		in6_dev_put(idev);	/* drop the reference taken above */
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	rt->dst.flags |= DST_HOST;
	rt->dst.output  = ip6_output;
	rt->rt6i_gateway  = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;	/* host route to the destination */
	rt->rt6i_idev     = idev;	/* idev reference ownership moves to rt */
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Add this dst into uncached_list so that rt6_ifdown() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);
	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}
2362
/* dst_ops->gc callback.  Cheap path: do nothing while the entry count is
 * under rt_max_size and the minimum GC interval has not elapsed.
 * Otherwise run fib6_run_gc() with a pressure value (ip6_rt_gc_expire)
 * that grows on every forced pass and decays by a 2^-rt_elasticity
 * fraction afterwards.  Returns true while still over rt_max_size.
 */
static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;	/* pressure relieved: reset */
out:
	/* exponential decay of the GC pressure value */
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
	return entries > rt_max_size;
}
2387
2388 static int ip6_convert_metrics(struct mx6_config *mxc,
2389                                const struct fib6_config *cfg)
2390 {
2391         bool ecn_ca = false;
2392         struct nlattr *nla;
2393         int remaining;
2394         u32 *mp;
2395
2396         if (!cfg->fc_mx)
2397                 return 0;
2398
2399         mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
2400         if (unlikely(!mp))
2401                 return -ENOMEM;
2402
2403         nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
2404                 int type = nla_type(nla);
2405                 u32 val;
2406
2407                 if (!type)
2408                         continue;
2409                 if (unlikely(type > RTAX_MAX))
2410                         goto err;
2411
2412                 if (type == RTAX_CC_ALGO) {
2413                         char tmp[TCP_CA_NAME_MAX];
2414
2415                         nla_strlcpy(tmp, nla, sizeof(tmp));
2416                         val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
2417                         if (val == TCP_CA_UNSPEC)
2418                                 goto err;
2419                 } else {
2420                         val = nla_get_u32(nla);
2421                 }
2422                 if (type == RTAX_HOPLIMIT && val > 255)
2423                         val = 255;
2424                 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
2425                         goto err;
2426
2427                 mp[type - 1] = val;
2428                 __set_bit(type - 1, mxc->mx_valid);
2429         }
2430
2431         if (ecn_ca) {
2432                 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
2433                 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
2434         }
2435
2436         mxc->mx = mp;
2437         return 0;
2438  err:
2439         kfree(mp);
2440         return -EINVAL;
2441 }
2442
2443 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2444                                             struct fib6_config *cfg,
2445                                             const struct in6_addr *gw_addr)
2446 {
2447         struct flowi6 fl6 = {
2448                 .flowi6_oif = cfg->fc_ifindex,
2449                 .daddr = *gw_addr,
2450                 .saddr = cfg->fc_prefsrc,
2451         };
2452         struct fib6_table *table;
2453         struct rt6_info *rt;
2454         int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE;
2455
2456         table = fib6_get_table(net, cfg->fc_table);
2457         if (!table)
2458                 return NULL;
2459
2460         if (!ipv6_addr_any(&cfg->fc_prefsrc))
2461                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2462
2463         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
2464
2465         /* if table lookup failed, fall back to full lookup */
2466         if (rt == net->ipv6.ip6_null_entry) {
2467                 ip6_rt_put(rt);
2468                 rt = NULL;
2469         }
2470
2471         return rt;
2472 }
2473
2474 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
2475                                               struct netlink_ext_ack *extack)
2476 {
2477         struct net *net = cfg->fc_nlinfo.nl_net;
2478         struct rt6_info *rt = NULL;
2479         struct net_device *dev = NULL;
2480         struct inet6_dev *idev = NULL;
2481         struct fib6_table *table;
2482         int addr_type;
2483         int err = -EINVAL;
2484
2485         /* RTF_PCPU is an internal flag; can not be set by userspace */
2486         if (cfg->fc_flags & RTF_PCPU) {
2487                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2488                 goto out;
2489         }
2490
2491         if (cfg->fc_dst_len > 128) {
2492                 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2493                 goto out;
2494         }
2495         if (cfg->fc_src_len > 128) {
2496                 NL_SET_ERR_MSG(extack, "Invalid source address length");
2497                 goto out;
2498         }
2499 #ifndef CONFIG_IPV6_SUBTREES
2500         if (cfg->fc_src_len) {
2501                 NL_SET_ERR_MSG(extack,
2502                                "Specifying source address requires IPV6_SUBTREES to be enabled");
2503                 goto out;
2504         }
2505 #endif
2506         if (cfg->fc_ifindex) {
2507                 err = -ENODEV;
2508                 dev = dev_get_by_index(net, cfg->fc_ifindex);
2509                 if (!dev)
2510                         goto out;
2511                 idev = in6_dev_get(dev);
2512                 if (!idev)
2513                         goto out;
2514         }
2515
2516         if (cfg->fc_metric == 0)
2517                 cfg->fc_metric = IP6_RT_PRIO_USER;
2518
2519         err = -ENOBUFS;
2520         if (cfg->fc_nlinfo.nlh &&
2521             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2522                 table = fib6_get_table(net, cfg->fc_table);
2523                 if (!table) {
2524                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2525                         table = fib6_new_table(net, cfg->fc_table);
2526                 }
2527         } else {
2528                 table = fib6_new_table(net, cfg->fc_table);
2529         }
2530
2531         if (!table)
2532                 goto out;
2533
2534         rt = ip6_dst_alloc(net, NULL,
2535                            (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
2536
2537         if (!rt) {
2538                 err = -ENOMEM;
2539                 goto out;
2540         }
2541
2542         if (cfg->fc_flags & RTF_EXPIRES)
2543                 rt6_set_expires(rt, jiffies +
2544                                 clock_t_to_jiffies(cfg->fc_expires));
2545         else
2546                 rt6_clean_expires(rt);
2547
2548         if (cfg->fc_protocol == RTPROT_UNSPEC)
2549                 cfg->fc_protocol = RTPROT_BOOT;
2550         rt->rt6i_protocol = cfg->fc_protocol;
2551
2552         addr_type = ipv6_addr_type(&cfg->fc_dst);
2553
2554         if (addr_type & IPV6_ADDR_MULTICAST)
2555                 rt->dst.input = ip6_mc_input;
2556         else if (cfg->fc_flags & RTF_LOCAL)
2557                 rt->dst.input = ip6_input;
2558         else
2559                 rt->dst.input = ip6_forward;
2560
2561         rt->dst.output = ip6_output;
2562
2563         if (cfg->fc_encap) {
2564                 struct lwtunnel_state *lwtstate;
2565
2566                 err = lwtunnel_build_state(cfg->fc_encap_type,
2567                                            cfg->fc_encap, AF_INET6, cfg,
2568                                            &lwtstate, extack);
2569                 if (err)
2570                         goto out;
2571                 rt->dst.lwtstate = lwtstate_get(lwtstate);
2572                 if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
2573                         rt->dst.lwtstate->orig_output = rt->dst.output;
2574                         rt->dst.output = lwtunnel_output;
2575                 }
2576                 if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
2577                         rt->dst.lwtstate->orig_input = rt->dst.input;
2578                         rt->dst.input = lwtunnel_input;
2579                 }
2580         }
2581
2582         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
2583         rt->rt6i_dst.plen = cfg->fc_dst_len;
2584         if (rt->rt6i_dst.plen == 128)
2585                 rt->dst.flags |= DST_HOST;
2586
2587 #ifdef CONFIG_IPV6_SUBTREES
2588         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
2589         rt->rt6i_src.plen = cfg->fc_src_len;
2590 #endif
2591
2592         rt->rt6i_metric = cfg->fc_metric;
2593
2594         /* We cannot add true routes via loopback here,
2595            they would result in kernel looping; promote them to reject routes
2596          */
2597         if ((cfg->fc_flags & RTF_REJECT) ||
2598             (dev && (dev->flags & IFF_LOOPBACK) &&
2599              !(addr_type & IPV6_ADDR_LOOPBACK) &&
2600              !(cfg->fc_flags & RTF_LOCAL))) {
2601                 /* hold loopback dev/idev if we haven't done so. */
2602                 if (dev != net->loopback_dev) {
2603                         if (dev) {
2604                                 dev_put(dev);
2605                                 in6_dev_put(idev);
2606                         }
2607                         dev = net->loopback_dev;
2608                         dev_hold(dev);
2609                         idev = in6_dev_get(dev);
2610                         if (!idev) {
2611                                 err = -ENODEV;
2612                                 goto out;
2613                         }
2614                 }
2615                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
2616                 switch (cfg->fc_type) {
2617                 case RTN_BLACKHOLE:
2618                         rt->dst.error = -EINVAL;
2619                         rt->dst.output = dst_discard_out;
2620                         rt->dst.input = dst_discard;
2621                         break;
2622                 case RTN_PROHIBIT:
2623                         rt->dst.error = -EACCES;
2624                         rt->dst.output = ip6_pkt_prohibit_out;
2625                         rt->dst.input = ip6_pkt_prohibit;
2626                         break;
2627                 case RTN_THROW:
2628                 case RTN_UNREACHABLE:
2629                 default:
2630                         rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
2631                                         : (cfg->fc_type == RTN_UNREACHABLE)
2632                                         ? -EHOSTUNREACH : -ENETUNREACH;
2633                         rt->dst.output = ip6_pkt_discard_out;
2634                         rt->dst.input = ip6_pkt_discard;
2635                         break;
2636                 }
2637                 goto install_route;
2638         }
2639
2640         if (cfg->fc_flags & RTF_GATEWAY) {
2641                 const struct in6_addr *gw_addr;
2642                 int gwa_type;
2643
2644                 gw_addr = &cfg->fc_gateway;
2645                 gwa_type = ipv6_addr_type(gw_addr);
2646
2647                 /* if gw_addr is local we will fail to detect this in case
2648                  * address is still TENTATIVE (DAD in progress). rt6_lookup()
2649                  * will return already-added prefix route via interface that
2650                  * prefix route was assigned to, which might be non-loopback.
2651                  */
2652                 err = -EINVAL;
2653                 if (ipv6_chk_addr_and_flags(net, gw_addr,
2654                                             gwa_type & IPV6_ADDR_LINKLOCAL ?
2655                                             dev : NULL, 0, 0)) {
2656                         NL_SET_ERR_MSG(extack, "Invalid gateway address");
2657                         goto out;
2658                 }
2659                 rt->rt6i_gateway = *gw_addr;
2660
2661                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
2662                         struct rt6_info *grt = NULL;
2663
2664                         /* IPv6 strictly inhibits using not link-local
2665                            addresses as nexthop address.
2666                            Otherwise, router will not able to send redirects.
2667                            It is very good, but in some (rare!) circumstances
2668                            (SIT, PtP, NBMA NOARP links) it is handy to allow
2669                            some exceptions. --ANK
2670                            We allow IPv4-mapped nexthops to support RFC4798-type
2671                            addressing
2672                          */
2673                         if (!(gwa_type & (IPV6_ADDR_UNICAST |
2674                                           IPV6_ADDR_MAPPED))) {
2675                                 NL_SET_ERR_MSG(extack,
2676                                                "Invalid gateway address");
2677                                 goto out;
2678                         }
2679
2680                         if (cfg->fc_table) {
2681                                 grt = ip6_nh_lookup_table(net, cfg, gw_addr);
2682
2683                                 if (grt) {
2684                                         if (grt->rt6i_flags & RTF_GATEWAY ||
2685                                             (dev && dev != grt->dst.dev)) {
2686                                                 ip6_rt_put(grt);
2687                                                 grt = NULL;
2688                                         }
2689                                 }
2690                         }
2691
2692                         if (!grt)
2693                                 grt = rt6_lookup(net, gw_addr, NULL,
2694                                                  cfg->fc_ifindex, 1);
2695
2696                         err = -EHOSTUNREACH;
2697                         if (!grt)
2698                                 goto out;
2699                         if (dev) {
2700                                 if (dev != grt->dst.dev) {
2701                                         ip6_rt_put(grt);
2702                                         goto out;
2703                                 }
2704                         } else {
2705                                 dev = grt->dst.dev;
2706                                 idev = grt->rt6i_idev;
2707                                 dev_hold(dev);
2708                                 in6_dev_hold(grt->rt6i_idev);
2709                         }
2710                         if (!(grt->rt6i_flags & RTF_GATEWAY))
2711                                 err = 0;
2712                         ip6_rt_put(grt);
2713
2714                         if (err)
2715                                 goto out;
2716                 }
2717                 err = -EINVAL;
2718                 if (!dev) {
2719                         NL_SET_ERR_MSG(extack, "Egress device not specified");
2720                         goto out;
2721                 } else if (dev->flags & IFF_LOOPBACK) {
2722                         NL_SET_ERR_MSG(extack,
2723                                        "Egress device can not be loopback device for this route");
2724                         goto out;
2725                 }
2726         }
2727
2728         err = -ENODEV;
2729         if (!dev)
2730                 goto out;
2731
2732         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2733                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2734                         NL_SET_ERR_MSG(extack, "Invalid source address");
2735                         err = -EINVAL;
2736                         goto out;
2737                 }
2738                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2739                 rt->rt6i_prefsrc.plen = 128;
2740         } else
2741                 rt->rt6i_prefsrc.plen = 0;
2742
2743         rt->rt6i_flags = cfg->fc_flags;
2744
2745 install_route:
2746         rt->dst.dev = dev;
2747         rt->rt6i_idev = idev;
2748         rt->rt6i_table = table;
2749
2750         cfg->fc_nlinfo.nl_net = dev_net(dev);
2751
2752         return rt;
2753 out:
2754         if (dev)
2755                 dev_put(dev);
2756         if (idev)
2757                 in6_dev_put(idev);
2758         if (rt)
2759                 dst_release_immediate(&rt->dst);
2760
2761         return ERR_PTR(err);
2762 }
2763
2764 int ip6_route_add(struct fib6_config *cfg,
2765                   struct netlink_ext_ack *extack)
2766 {
2767         struct mx6_config mxc = { .mx = NULL, };
2768         struct rt6_info *rt;
2769         int err;
2770
2771         rt = ip6_route_info_create(cfg, extack);
2772         if (IS_ERR(rt)) {
2773                 err = PTR_ERR(rt);
2774                 rt = NULL;
2775                 goto out;
2776         }
2777
2778         err = ip6_convert_metrics(&mxc, cfg);
2779         if (err)
2780                 goto out;
2781
2782         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
2783
2784         kfree(mxc.mx);
2785
2786         return err;
2787 out:
2788         if (rt)
2789                 dst_release_immediate(&rt->dst);
2790
2791         return err;
2792 }
2793
2794 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2795 {
2796         int err;
2797         struct fib6_table *table;
2798         struct net *net = dev_net(rt->dst.dev);
2799
2800         if (rt == net->ipv6.ip6_null_entry) {
2801                 err = -ENOENT;
2802                 goto out;
2803         }
2804
2805         table = rt->rt6i_table;
2806         spin_lock_bh(&table->tb6_lock);
2807         err = fib6_del(rt, info);
2808         spin_unlock_bh(&table->tb6_lock);
2809
2810 out:
2811         ip6_rt_put(rt);
2812         return err;
2813 }
2814
2815 int ip6_del_rt(struct rt6_info *rt)
2816 {
2817         struct nl_info info = {
2818                 .nl_net = dev_net(rt->dst.dev),
2819         };
2820         return __ip6_del_rt(rt, &info);
2821 }
2822
/* Delete @rt and, when cfg->fc_delete_all_nh is set, every ECMP sibling
 * in the same group under a single hold of the table lock.  Tries to
 * emit one RTM_DELROUTE notification describing all hops instead of one
 * per nexthop.  Consumes the caller's reference on @rt.  Returns 0 or a
 * negative errno (-ENOENT for the null entry).
 */
static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	int err = -ENOENT;

	/* The null entry is never deletable. */
	if (rt == net->ipv6.ip6_null_entry)
		goto out_put;
	table = rt->rt6i_table;
	spin_lock_bh(&table->tb6_lock);

	if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
		struct rt6_info *sibling, *next_sibling;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			if (rt6_fill_node(net, skb, rt,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				kfree_skb(skb);
				skb = NULL;
			} else
				/* suppress the per-route notifications
				 * fib6_del() would otherwise generate
				 */
				info->skip_notify = 1;
		}

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->rt6i_siblings,
					 rt6i_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
		}
	}

	err = fib6_del(rt, info);
out_unlock:
	spin_unlock_bh(&table->tb6_lock);
out_put:
	ip6_rt_put(rt);

	/* Send the combined notification outside the table lock. */
	if (skb) {
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
	return err;
}
2874
/* Delete the route described by @cfg.  Candidate entries under the
 * located fib node are filtered by device, gateway, metric and protocol
 * when those fields are set in @cfg; with RTF_CACHE the match is made
 * against cached exception routes instead of fib entries.  Returns 0 on
 * success or -ESRCH when the table or a matching route does not exist.
 */
static int ip6_route_del(struct fib6_config *cfg,
			 struct netlink_ext_ack *extack)
{
	struct rt6_info *rt, *rt_cache;
	struct fib6_table *table;
	struct fib6_node *fn;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table) {
		NL_SET_ERR_MSG(extack, "FIB table does not exist");
		return err;
	}

	rcu_read_lock();

	/* NOTE(review): the final argument flips based on RTF_CACHE —
	 * verify the intended exact-match semantics against
	 * fib6_locate()'s contract.
	 */
	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len,
			 !(cfg->fc_flags & RTF_CACHE));

	if (fn) {
		for_each_fib6_node_rt_rcu(fn) {
			if (cfg->fc_flags & RTF_CACHE) {
				/* Switch the candidate to the cached clone
				 * matching dst/src exactly, if any.
				 */
				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
							      &cfg->fc_src);
				if (!rt_cache)
					continue;
				/* NOTE(review): reusing the iterator means a
				 * later "continue" advances from the cached
				 * clone, not the fib list — confirm the
				 * filters below cannot reject it.
				 */
				rt = rt_cache;
			}
			if (cfg->fc_ifindex &&
			    (!rt->dst.dev ||
			     rt->dst.dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
				continue;
			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
				continue;
			if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
				continue;
			/* If we cannot take a reference the route is already
			 * being destroyed; nothing left to delete.
			 */
			if (!dst_hold_safe(&rt->dst))
				break;
			rcu_read_unlock();

			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				return __ip6_del_rt(rt, &cfg->fc_nlinfo);

			return __ip6_del_rt_siblings(rt, cfg);
		}
	}
	rcu_read_unlock();

	return err;
}
2931
/* Handle a received ICMPv6 Redirect (see RFC 2461 sect. 8): validate the
 * message and its ND options, update the neighbour cache entry for the
 * target, and install a cached (RTF_CACHE) exception route steering
 * msg->dest via the new nexthop.  Invalid or unacceptable redirects are
 * dropped silently (rate-limited debug message only).
 */
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct netevent_redirect netevent;
	struct rt6_info *rt, *nrt = NULL;
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	struct rd_msg *msg;
	int optlen, on_link;
	u8 *lladdr;

	/* Length of the ND options following the fixed redirect header. */
	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
	optlen -= sizeof(*msg);

	if (optlen < 0) {
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
		return;
	}

	msg = (struct rd_msg *)icmp6_hdr(skb);

	if (ipv6_addr_is_multicast(&msg->dest)) {
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
		return;
	}

	/* dest == target means the destination itself is on-link;
	 * otherwise the target must be a link-local unicast router.
	 */
	on_link = 0;
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
		on_link = 1;
	} else if (ipv6_addr_type(&msg->target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
		return;
	}

	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	/* Forwarding interfaces and interfaces configured to ignore
	 * redirects drop the message here.
	 */
	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}

	/* Optional target link-layer address option. */
	lladdr = NULL;
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

	rt = (struct rt6_info *) dst;
	if (rt->rt6i_flags & RTF_REJECT) {
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
		return;
	}

	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);

	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
	if (!neigh)
		return;

	/*
	 *	We have finally decided to accept it.
	 */

	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER)),
		     NDISC_REDIRECT, &ndopts);

	nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
	if (!nrt)
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	nrt->rt6i_protocol = RTPROT_REDIRECT;
	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

	/* No need to remove rt from the exception table if rt is
	 * a cached route because rt6_insert_exception() will
	 * take care of it
	 */
	if (rt6_insert_exception(nrt, rt)) {
		dst_release_immediate(&nrt->dst);
		goto out;
	}

	/* Tell interested parties (e.g. hardware offload) that the path
	 * for msg->dest changed.
	 */
	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
	netevent.daddr = &msg->dest;
	netevent.neigh = neigh;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

out:
	neigh_release(neigh);
}
3049
3050 /*
3051  *      Misc support functions
3052  */
3053
/* Link clone @rt to its parent @from: take a reference on the parent,
 * record it in dst.from, and initialize @rt's metrics from the parent's
 * metrics pointer.  @from must not itself be a clone (dst.from unset).
 * RTF_EXPIRES is cleared on @rt so expiry state is not duplicated on
 * the clone.
 */
static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
{
	BUG_ON(from->dst.from);

	rt->rt6i_flags &= ~RTF_EXPIRES;
	/* Hold the parent before publishing the pointer to it. */
	dst_hold(&from->dst);
	rt->dst.from = &from->dst;
	dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
}
3063
/* Initialize the freshly allocated route @rt from the origin entry @ort:
 * copy handlers, destination/source keys, flags, gateway and table
 * linkage; take references on the idev and lwtstate; and link @rt to
 * @ort via rt6_set_from() so the clone shares the parent's metrics.
 */
static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
{
	rt->dst.input = ort->dst.input;
	rt->dst.output = ort->dst.output;
	rt->rt6i_dst = ort->rt6i_dst;
	rt->dst.error = ort->dst.error;
	rt->rt6i_idev = ort->rt6i_idev;
	if (rt->rt6i_idev)
		in6_dev_hold(rt->rt6i_idev);
	rt->dst.lastuse = jiffies;
	rt->rt6i_gateway = ort->rt6i_gateway;
	rt->rt6i_flags = ort->rt6i_flags;
	rt6_set_from(rt, ort);
	rt->rt6i_metric = ort->rt6i_metric;
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->rt6i_src;
#endif
	rt->rt6i_prefsrc = ort->rt6i_prefsrc;
	rt->rt6i_table = ort->rt6i_table;
	/* lwtstate_get() takes its own reference on the shared state. */
	rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
}
3085
3086 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Look up an RA-learned route-information entry (RTF_ROUTEINFO|RTF_GATEWAY)
 * for @prefix/@prefixlen that was installed via @dev with gateway @gwaddr.
 * Returns the route with a reference held, or NULL when the table, node
 * or a matching entry does not exist.
 */
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev)
{
	/* Routes for an l3mdev-enslaved device live in the VRF table. */
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	int ifindex = dev->ifindex;
	struct fib6_node *fn;
	struct rt6_info *rt = NULL;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
	if (!fn)
		goto out;

	for_each_fib6_node_rt_rcu(fn) {
		if (rt->dst.dev->ifindex != ifindex)
			continue;
		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
			continue;
		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
			continue;
		/* Take a reference before leaving the RCU section. */
		ip6_hold_safe(NULL, &rt, false);
		break;
	}
out:
	rcu_read_unlock();
	return rt;
}
3121
3122 static struct rt6_info *rt6_add_route_info(struct net *net,
3123                                            const struct in6_addr *prefix, int prefixlen,
3124                                            const struct in6_addr *gwaddr,
3125                                            struct net_device *dev,
3126                                            unsigned int pref)
3127 {
3128         struct fib6_config cfg = {
3129                 .fc_metric      = IP6_RT_PRIO_USER,
3130                 .fc_ifindex     = dev->ifindex,
3131                 .fc_dst_len     = prefixlen,
3132                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3133                                   RTF_UP | RTF_PREF(pref),
3134                 .fc_protocol = RTPROT_RA,
3135                 .fc_nlinfo.portid = 0,
3136                 .fc_nlinfo.nlh = NULL,
3137                 .fc_nlinfo.nl_net = net,
3138         };
3139
3140         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3141         cfg.fc_dst = *prefix;
3142         cfg.fc_gateway = *gwaddr;
3143
3144         /* We should treat it as a default route if prefix length is 0. */
3145         if (!prefixlen)
3146                 cfg.fc_flags |= RTF_DEFAULT;
3147
3148         ip6_route_add(&cfg, NULL);
3149
3150         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3151 }
3152 #endif
3153
/* Find the RA-installed default route (RTF_ADDRCONF|RTF_DEFAULT) via
 * router @addr on @dev.  Returns it with a reference held, or NULL.
 * Relies on the iteration macro leaving the cursor NULL after a full
 * traversal with no match.
 */
struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
{
	/* Default routes for an l3mdev slave live in the VRF table. */
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
	struct rt6_info *rt;
	struct fib6_table *table;

	table = fib6_get_table(dev_net(dev), tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		if (dev == rt->dst.dev &&
		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
			break;
	}
	/* Take a reference before leaving the RCU section. */
	if (rt)
		ip6_hold_safe(NULL, &rt, false);
	rcu_read_unlock();
	return rt;
}
3176
3177 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
3178                                      struct net_device *dev,
3179                                      unsigned int pref)
3180 {
3181         struct fib6_config cfg = {
3182                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3183                 .fc_metric      = IP6_RT_PRIO_USER,
3184                 .fc_ifindex     = dev->ifindex,
3185                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3186                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3187                 .fc_protocol = RTPROT_RA,
3188                 .fc_nlinfo.portid = 0,
3189                 .fc_nlinfo.nlh = NULL,
3190                 .fc_nlinfo.nl_net = dev_net(dev),
3191         };
3192
3193         cfg.fc_gateway = *gwaddr;
3194
3195         if (!ip6_route_add(&cfg, NULL)) {
3196                 struct fib6_table *table;
3197
3198                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3199                 if (table)
3200                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3201         }
3202
3203         return rt6_get_dflt_router(gwaddr, dev);
3204 }
3205
/* Remove RA-added default routes from @table, except on interfaces
 * whose accept_ra is 2 (those are explicitly kept), then clear the
 * table's default-router flag.
 */
static void __rt6_purge_dflt_routers(struct fib6_table *table)
{
	struct rt6_info *rt;

restart:
	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
			/* Deleting invalidates the RCU walk, so drop the
			 * lock and restart the scan from the top.  The
			 * reference taken here is consumed by ip6_del_rt();
			 * if it cannot be taken the route is already dying.
			 */
			if (dst_hold_safe(&rt->dst)) {
				rcu_read_unlock();
				ip6_del_rt(rt);
			} else {
				rcu_read_unlock();
			}
			goto restart;
		}
	}
	rcu_read_unlock();

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}
3228
3229 void rt6_purge_dflt_routers(struct net *net)
3230 {
3231         struct fib6_table *table;
3232         struct hlist_head *head;
3233         unsigned int h;
3234
3235         rcu_read_lock();
3236
3237         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3238                 head = &net->ipv6.fib_table_hash[h];
3239                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3240                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3241                                 __rt6_purge_dflt_routers(table);
3242                 }
3243         }
3244
3245         rcu_read_unlock();
3246 }
3247
3248 static void rtmsg_to_fib6_config(struct net *net,
3249                                  struct in6_rtmsg *rtmsg,
3250                                  struct fib6_config *cfg)
3251 {
3252         memset(cfg, 0, sizeof(*cfg));
3253
3254         cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3255                          : RT6_TABLE_MAIN;
3256         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3257         cfg->fc_metric = rtmsg->rtmsg_metric;
3258         cfg->fc_expires = rtmsg->rtmsg_info;
3259         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3260         cfg->fc_src_len = rtmsg->rtmsg_src_len;
3261         cfg->fc_flags = rtmsg->rtmsg_flags;
3262
3263         cfg->fc_nlinfo.nl_net = net;
3264
3265         cfg->fc_dst = rtmsg->rtmsg_dst;
3266         cfg->fc_src = rtmsg->rtmsg_src;
3267         cfg->fc_gateway = rtmsg->rtmsg_gateway;
3268 }
3269
3270 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3271 {
3272         struct fib6_config cfg;
3273         struct in6_rtmsg rtmsg;
3274         int err;
3275
3276         switch (cmd) {
3277         case SIOCADDRT:         /* Add a route */
3278         case SIOCDELRT:         /* Delete a route */
3279                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3280                         return -EPERM;
3281                 err = copy_from_user(&rtmsg, arg,
3282                                      sizeof(struct in6_rtmsg));
3283                 if (err)
3284                         return -EFAULT;
3285
3286                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3287
3288                 rtnl_lock();
3289                 switch (cmd) {
3290                 case SIOCADDRT:
3291                         err = ip6_route_add(&cfg, NULL);
3292                         break;
3293                 case SIOCDELRT:
3294                         err = ip6_route_del(&cfg, NULL);
3295                         break;
3296                 default:
3297                         err = -EINVAL;
3298                 }
3299                 rtnl_unlock();
3300
3301                 return err;
3302         }
3303
3304         return -EINVAL;
3305 }
3306
3307 /*
3308  *      Drop the packet on the floor
3309  */
3310
3311 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3312 {
3313         int type;
3314         struct dst_entry *dst = skb_dst(skb);
3315         switch (ipstats_mib_noroutes) {
3316         case IPSTATS_MIB_INNOROUTES:
3317                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3318                 if (type == IPV6_ADDR_ANY) {
3319                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3320                                       IPSTATS_MIB_INADDRERRORS);
3321                         break;
3322                 }
3323                 /* FALLTHROUGH */
3324         case IPSTATS_MIB_OUTNOROUTES:
3325                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3326                               ipstats_mib_noroutes);
3327                 break;
3328         }
3329         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3330         kfree_skb(skb);
3331         return 0;
3332 }
3333
/* dst input handler: drop the packet with ICMPV6_NOROUTE, counted
 * against the input "no route" statistic.
 */
static int ip6_pkt_discard(struct sk_buff *skb)
{
        return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
3338
/* dst output handler: point skb->dev at the route's device, then drop
 * with ICMPV6_NOROUTE counted against the output "no route" statistic.
 */
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        skb->dev = skb_dst(skb)->dev;
        return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}
3344
/* dst input handler: drop the packet with ICMPV6_ADM_PROHIBITED
 * (administratively prohibited), counted as an input "no route".
 */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
        return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}
3349
/* dst output handler: point skb->dev at the route's device, then drop
 * with ICMPV6_ADM_PROHIBITED counted as an output "no route".
 */
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        skb->dev = skb_dst(skb)->dev;
        return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}
3355
3356 /*
3357  *      Allocate a dst for local (unicast / anycast) address.
3358  */
3359
3360 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
3361                                     const struct in6_addr *addr,
3362                                     bool anycast)
3363 {
3364         u32 tb_id;
3365         struct net *net = dev_net(idev->dev);
3366         struct net_device *dev = idev->dev;
3367         struct rt6_info *rt;
3368
3369         rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
3370         if (!rt)
3371                 return ERR_PTR(-ENOMEM);
3372
3373         in6_dev_hold(idev);
3374
3375         rt->dst.flags |= DST_HOST;
3376         rt->dst.input = ip6_input;
3377         rt->dst.output = ip6_output;
3378         rt->rt6i_idev = idev;
3379
3380         rt->rt6i_protocol = RTPROT_KERNEL;
3381         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
3382         if (anycast)
3383                 rt->rt6i_flags |= RTF_ANYCAST;
3384         else
3385                 rt->rt6i_flags |= RTF_LOCAL;
3386
3387         rt->rt6i_gateway  = *addr;
3388         rt->rt6i_dst.addr = *addr;
3389         rt->rt6i_dst.plen = 128;
3390         tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3391         rt->rt6i_table = fib6_get_table(net, tb_id);
3392
3393         return rt;
3394 }
3395
/* remove deleted ip from prefsrc entries */
struct arg_dev_net_ip {
        struct net_device *dev; /* restrict to this device; NULL = any */
        struct net *net;        /* namespace whose tables are walked */
        struct in6_addr *addr;  /* the address being removed */
};
3402
3403 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
3404 {
3405         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3406         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3407         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3408
3409         if (((void *)rt->dst.dev == dev || !dev) &&
3410             rt != net->ipv6.ip6_null_entry &&
3411             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
3412                 spin_lock_bh(&rt6_exception_lock);
3413                 /* remove prefsrc entry */
3414                 rt->rt6i_prefsrc.plen = 0;
3415                 /* need to update cache as well */
3416                 rt6_exceptions_remove_prefsrc(rt);
3417                 spin_unlock_bh(&rt6_exception_lock);
3418         }
3419         return 0;
3420 }
3421
/* Address @ifp is going away: walk all FIB tables in its namespace and
 * clear the prefsrc of any route that still references it.
 */
void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
{
        struct net *net = dev_net(ifp->idev->dev);
        struct arg_dev_net_ip adni = {
                .dev = ifp->idev->dev,
                .net = net,
                .addr = &ifp->addr,
        };
        fib6_clean_all(net, fib6_remove_prefsrc, &adni);
}
3432
3433 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3434
3435 /* Remove routers and update dst entries when gateway turn into host. */
3436 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
3437 {
3438         struct in6_addr *gateway = (struct in6_addr *)arg;
3439
3440         if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3441             ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
3442                 return -1;
3443         }
3444
3445         /* Further clean up cached routes in exception table.
3446          * This is needed because cached route may have a different
3447          * gateway than its 'parent' in the case of an ip redirect.
3448          */
3449         rt6_exceptions_clean_tohost(rt, gateway);
3450
3451         return 0;
3452 }
3453
/* Walk every FIB table in @net, removing RA router routes via @gateway
 * and scrubbing matching cached exceptions (see fib6_clean_tohost).
 */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
        fib6_clean_all(net, fib6_clean_tohost, gateway);
}
3458
/* argument bundle for fib6_ifdown() */
struct arg_dev_net {
        struct net_device *dev; /* device going down; NULL means all */
        struct net *net;        /* owning network namespace */
};
3463
/* called with write lock held for table with rt */
static int fib6_ifdown(struct rt6_info *rt, void *arg)
{
        const struct arg_dev_net *adn = arg;
        const struct net_device *dev = adn->dev;

        /* Delete (-1) routes on the target device (or on any device when
         * dev == NULL), never the null entry.  A route with multipath
         * siblings survives when the device still exists (not
         * unregistering) and ignore_routes_with_linkdown is set.
         */
        if ((rt->dst.dev == dev || !dev) &&
            rt != adn->net->ipv6.ip6_null_entry &&
            (rt->rt6i_nsiblings == 0 ||
             (dev && netdev_unregistering(dev)) ||
             !rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
                return -1;

        return 0;
}
3479
/* Remove routes tied to @dev (all devices when @dev is NULL) from every
 * FIB table, then flush @dev's entries from the uncached route list.
 */
void rt6_ifdown(struct net *net, struct net_device *dev)
{
        struct arg_dev_net adn = {
                .dev = dev,
                .net = net,
        };

        fib6_clean_all(net, fib6_ifdown, &adn);
        if (dev)
                rt6_uncached_list_flush_dev(net, dev);
}
3491
/* argument bundle for rt6_mtu_change_route() */
struct rt6_mtu_change_arg {
        struct net_device *dev; /* device whose MTU changed */
        unsigned int mtu;       /* the new MTU */
};
3496
/* fib6_clean_all callback for rt6_mtu_change(): propagate a device MTU
 * change on arg->dev into routes' RTAX_MTU metric and their cached
 * exception routes.  Always returns 0 (never removes routes).
 */
static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
{
        struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
        struct inet6_dev *idev;

        /* In IPv6 pmtu discovery is not optional,
           so that RTAX_MTU lock cannot disable it.
           We still use this lock to block changes
           caused by addrconf/ndisc.
        */

        idev = __in6_dev_get(arg->dev);
        if (!idev)
                return 0;

        /* For administrative MTU increase, there is no way to discover
           IPv6 PMTU increase, so PMTU increase should be updated here.
           Since RFC 1981 doesn't include administrative MTU increase
           update PMTU increase is a MUST. (i.e. jumbo frame)
         */
        /*
           If new MTU is less than route PMTU, this new MTU will be the
           lowest MTU in the path, update the route PMTU to reflect PMTU
           decreases; if new MTU is greater than route PMTU, and the
           old MTU is the lowest MTU in the path, update the route PMTU
           to reflect the increase. In this case if the other nodes' MTU
           also have the lowest MTU, TOO BIG MESSAGE will be lead to
           PMTU discovery.
         */
        if (rt->dst.dev == arg->dev &&
            dst_metric_raw(&rt->dst, RTAX_MTU) &&
            !dst_metric_locked(&rt->dst, RTAX_MTU)) {
                spin_lock_bh(&rt6_exception_lock);
                if (dst_mtu(&rt->dst) >= arg->mtu ||
                    (dst_mtu(&rt->dst) < arg->mtu &&
                     dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
                        dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
                }
                /* cached exceptions are updated under the same lock */
                rt6_exceptions_update_pmtu(rt, arg->mtu);
                spin_unlock_bh(&rt6_exception_lock);
        }
        return 0;
}
3540
/* Propagate a device MTU change to all routes over @dev; see
 * rt6_mtu_change_route() for the update policy.
 */
void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
{
        struct rt6_mtu_change_arg arg = {
                .dev = dev,
                .mtu = mtu,
        };

        fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
}
3550
/* netlink attribute policy for IPv6 RTM_{NEW,DEL,GET}ROUTE requests.
 * Attributes without an entry here (e.g. RTA_DST/RTA_SRC) are accepted
 * as-is and validated by hand in rtm_to_fib6_config().
 */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
        [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
        [RTA_OIF]               = { .type = NLA_U32 },
        [RTA_IIF]               = { .type = NLA_U32 },
        [RTA_PRIORITY]          = { .type = NLA_U32 },
        [RTA_METRICS]           = { .type = NLA_NESTED },
        [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
        [RTA_PREF]              = { .type = NLA_U8 },
        [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
        [RTA_ENCAP]             = { .type = NLA_NESTED },
        [RTA_EXPIRES]           = { .type = NLA_U32 },
        [RTA_UID]               = { .type = NLA_U32 },
        [RTA_MARK]              = { .type = NLA_U32 },
};
3565
/* Translate an RTM_NEWROUTE/RTM_DELROUTE request into a fib6_config.
 * Note: cfg->fc_mx and cfg->fc_mp are left pointing at attribute data
 * inside the request message, not copied.
 * Returns 0 on success or a negative errno on malformed input.
 */
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
                              struct fib6_config *cfg,
                              struct netlink_ext_ack *extack)
{
        struct rtmsg *rtm;
        struct nlattr *tb[RTA_MAX+1];
        unsigned int pref;
        int err;

        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
                          NULL);
        if (err < 0)
                goto errout;

        err = -EINVAL;
        rtm = nlmsg_data(nlh);
        memset(cfg, 0, sizeof(*cfg));

        cfg->fc_table = rtm->rtm_table;
        cfg->fc_dst_len = rtm->rtm_dst_len;
        cfg->fc_src_len = rtm->rtm_src_len;
        cfg->fc_flags = RTF_UP;
        cfg->fc_protocol = rtm->rtm_protocol;
        cfg->fc_type = rtm->rtm_type;

        /* these route types are installed as reject routes */
        if (rtm->rtm_type == RTN_UNREACHABLE ||
            rtm->rtm_type == RTN_BLACKHOLE ||
            rtm->rtm_type == RTN_PROHIBIT ||
            rtm->rtm_type == RTN_THROW)
                cfg->fc_flags |= RTF_REJECT;

        if (rtm->rtm_type == RTN_LOCAL)
                cfg->fc_flags |= RTF_LOCAL;

        /* RTM_F_CLONED requests operation on cached (cloned) routes */
        if (rtm->rtm_flags & RTM_F_CLONED)
                cfg->fc_flags |= RTF_CACHE;

        cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
        cfg->fc_nlinfo.nlh = nlh;
        cfg->fc_nlinfo.nl_net = sock_net(skb->sk);

        if (tb[RTA_GATEWAY]) {
                cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
                cfg->fc_flags |= RTF_GATEWAY;
        }

        if (tb[RTA_DST]) {
                /* only the prefix-length's worth of bytes must be present */
                int plen = (rtm->rtm_dst_len + 7) >> 3;

                if (nla_len(tb[RTA_DST]) < plen)
                        goto errout;

                nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
        }

        if (tb[RTA_SRC]) {
                int plen = (rtm->rtm_src_len + 7) >> 3;

                if (nla_len(tb[RTA_SRC]) < plen)
                        goto errout;

                nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
        }

        if (tb[RTA_PREFSRC])
                cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

        if (tb[RTA_OIF])
                cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

        if (tb[RTA_PRIORITY])
                cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

        if (tb[RTA_METRICS]) {
                cfg->fc_mx = nla_data(tb[RTA_METRICS]);
                cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
        }

        /* RTA_TABLE overrides the legacy rtm_table field */
        if (tb[RTA_TABLE])
                cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

        if (tb[RTA_MULTIPATH]) {
                cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
                cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

                err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
                                                     cfg->fc_mp_len, extack);
                if (err < 0)
                        goto errout;
        }

        if (tb[RTA_PREF]) {
                /* unknown preference values fall back to medium */
                pref = nla_get_u8(tb[RTA_PREF]);
                if (pref != ICMPV6_ROUTER_PREF_LOW &&
                    pref != ICMPV6_ROUTER_PREF_HIGH)
                        pref = ICMPV6_ROUTER_PREF_MEDIUM;
                cfg->fc_flags |= RTF_PREF(pref);
        }

        if (tb[RTA_ENCAP])
                cfg->fc_encap = tb[RTA_ENCAP];

        if (tb[RTA_ENCAP_TYPE]) {
                cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

                err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
                if (err < 0)
                        goto errout;
        }

        if (tb[RTA_EXPIRES]) {
                unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

                /* only a finite timeout produces an expiring route */
                if (addrconf_finite_timeout(timeout)) {
                        cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
                        cfg->fc_flags |= RTF_EXPIRES;
                }
        }

        err = 0;
errout:
        return err;
}
3689
/* Per-nexthop bookkeeping used while building/rolling back a multipath
 * route: one entry per rtnexthop parsed from RTA_MULTIPATH.
 */
struct rt6_nh {
        struct rt6_info *rt6_info;      /* route created for this nexthop */
        struct fib6_config r_cfg;       /* config used to create/delete it */
        struct mx6_config mxc;          /* converted metrics for insertion */
        struct list_head next;          /* link in the rt6_nh_list */
};
3696
3697 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
3698 {
3699         struct rt6_nh *nh;
3700
3701         list_for_each_entry(nh, rt6_nh_list, next) {
3702                 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
3703                         &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
3704                         nh->r_cfg.fc_ifindex);
3705         }
3706 }
3707
3708 static int ip6_route_info_append(struct list_head *rt6_nh_list,
3709                                  struct rt6_info *rt, struct fib6_config *r_cfg)
3710 {
3711         struct rt6_nh *nh;
3712         int err = -EEXIST;
3713
3714         list_for_each_entry(nh, rt6_nh_list, next) {
3715                 /* check if rt6_info already exists */
3716                 if (rt6_duplicate_nexthop(nh->rt6_info, rt))
3717                         return err;
3718         }
3719
3720         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
3721         if (!nh)
3722                 return -ENOMEM;
3723         nh->rt6_info = rt;
3724         err = ip6_convert_metrics(&nh->mxc, r_cfg);
3725         if (err) {
3726                 kfree(nh);
3727                 return err;
3728         }
3729         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
3730         list_add_tail(&nh->next, rt6_nh_list);
3731
3732         return 0;
3733 }
3734
3735 static void ip6_route_mpath_notify(struct rt6_info *rt,
3736                                    struct rt6_info *rt_last,
3737                                    struct nl_info *info,
3738                                    __u16 nlflags)
3739 {
3740         /* if this is an APPEND route, then rt points to the first route
3741          * inserted and rt_last points to last route inserted. Userspace
3742          * wants a consistent dump of the route which starts at the first
3743          * nexthop. Since sibling routes are always added at the end of
3744          * the list, find the first sibling of the last route appended
3745          */
3746         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
3747                 rt = list_first_entry(&rt_last->rt6i_siblings,
3748                                       struct rt6_info,
3749                                       rt6i_siblings);
3750         }
3751
3752         if (rt)
3753                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
3754 }
3755
/* Insert one route per nexthop listed in RTA_MULTIPATH, linking them as
 * siblings.  If any nexthop fails to insert, the nexthops inserted
 * before it are deleted again, and a notification covering the partial
 * set is sent so userspace stays coherent; on success one notification
 * covering the whole route is sent.  Returns 0 or a negative errno.
 */
static int ip6_route_multipath_add(struct fib6_config *cfg,
                                   struct netlink_ext_ack *extack)
{
        struct rt6_info *rt_notif = NULL, *rt_last = NULL;
        struct nl_info *info = &cfg->fc_nlinfo;
        struct fib6_config r_cfg;
        struct rtnexthop *rtnh;
        struct rt6_info *rt;
        struct rt6_nh *err_nh;
        struct rt6_nh *nh, *nh_safe;
        __u16 nlflags;
        int remaining;
        int attrlen;
        int err = 1;
        int nhn = 0;
        int replace = (cfg->fc_nlinfo.nlh &&
                       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
        LIST_HEAD(rt6_nh_list);

        nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
        if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
                nlflags |= NLM_F_APPEND;

        remaining = cfg->fc_mp_len;
        rtnh = (struct rtnexthop *)cfg->fc_mp;

        /* Parse a Multipath Entry and build a list (rt6_nh_list) of
         * rt6_info structs per nexthop
         */
        while (rtnh_ok(rtnh, remaining)) {
                /* per-nexthop config starts as a copy of the route config */
                memcpy(&r_cfg, cfg, sizeof(*cfg));
                if (rtnh->rtnh_ifindex)
                        r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

                attrlen = rtnh_attrlen(rtnh);
                if (attrlen > 0) {
                        struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

                        nla = nla_find(attrs, attrlen, RTA_GATEWAY);
                        if (nla) {
                                r_cfg.fc_gateway = nla_get_in6_addr(nla);
                                r_cfg.fc_flags |= RTF_GATEWAY;
                        }
                        r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
                        nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
                        if (nla)
                                r_cfg.fc_encap_type = nla_get_u16(nla);
                }

                rt = ip6_route_info_create(&r_cfg, extack);
                if (IS_ERR(rt)) {
                        err = PTR_ERR(rt);
                        rt = NULL;
                        goto cleanup;
                }

                err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
                if (err) {
                        dst_release_immediate(&rt->dst);
                        goto cleanup;
                }

                rtnh = rtnh_next(rtnh, &remaining);
        }

        /* for add and replace send one notification with all nexthops.
         * Skip the notification in fib6_add_rt2node and send one with
         * the full route when done
         */
        info->skip_notify = 1;

        err_nh = NULL;
        list_for_each_entry(nh, &rt6_nh_list, next) {
                rt_last = nh->rt6_info;
                err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
                /* save reference to first route for notification */
                if (!rt_notif && !err)
                        rt_notif = nh->rt6_info;

                /* nh->rt6_info is used or freed at this point, reset to NULL*/
                nh->rt6_info = NULL;
                if (err) {
                        if (replace && nhn)
                                ip6_print_replace_route_err(&rt6_nh_list);
                        err_nh = nh;
                        goto add_errout;
                }

                /* Because each route is added like a single route we remove
                 * these flags after the first nexthop: if there is a collision,
                 * we have already failed to add the first nexthop:
                 * fib6_add_rt2node() has rejected it; when replacing, old
                 * nexthops have been replaced by first new, the rest should
                 * be added to it.
                 */
                cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
                                                     NLM_F_REPLACE);
                nhn++;
        }

        /* success ... tell user about new route */
        ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
        goto cleanup;

add_errout:
        /* send notification for routes that were added so that
         * the delete notifications sent by ip6_route_del are
         * coherent
         */
        if (rt_notif)
                ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

        /* Delete routes that were already added */
        list_for_each_entry(nh, &rt6_nh_list, next) {
                if (err_nh == nh)
                        break;
                ip6_route_del(&nh->r_cfg, extack);
        }

cleanup:
        list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
                if (nh->rt6_info)
                        dst_release_immediate(&nh->rt6_info->dst);
                kfree(nh->mxc.mx);
                list_del(&nh->next);
                kfree(nh);
        }

        return err;
}
3886
/* Delete one route per nexthop listed in the RTA_MULTIPATH attribute.
 * Every entry is attempted even if an earlier one fails; the error of
 * the last failing deletion is returned, 0 if all succeeded.
 */
static int ip6_route_multipath_del(struct fib6_config *cfg,
                                   struct netlink_ext_ack *extack)
{
        struct fib6_config r_cfg;
        struct rtnexthop *rtnh;
        int remaining;
        int attrlen;
        int err = 1, last_err = 0;

        remaining = cfg->fc_mp_len;
        rtnh = (struct rtnexthop *)cfg->fc_mp;

        /* Parse a Multipath Entry */
        while (rtnh_ok(rtnh, remaining)) {
                memcpy(&r_cfg, cfg, sizeof(*cfg));
                if (rtnh->rtnh_ifindex)
                        r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

                attrlen = rtnh_attrlen(rtnh);
                if (attrlen > 0) {
                        struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

                        nla = nla_find(attrs, attrlen, RTA_GATEWAY);
                        if (nla) {
                                /* 16 == sizeof(struct in6_addr) */
                                nla_memcpy(&r_cfg.fc_gateway, nla, 16);
                                r_cfg.fc_flags |= RTF_GATEWAY;
                        }
                }
                err = ip6_route_del(&r_cfg, extack);
                if (err)
                        last_err = err;

                rtnh = rtnh_next(rtnh, &remaining);
        }

        return last_err;
}
3924
3925 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3926                               struct netlink_ext_ack *extack)
3927 {
3928         struct fib6_config cfg;
3929         int err;
3930
3931         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3932         if (err < 0)
3933                 return err;
3934
3935         if (cfg.fc_mp)
3936                 return ip6_route_multipath_del(&cfg, extack);
3937         else {
3938                 cfg.fc_delete_all_nh = 1;
3939                 return ip6_route_del(&cfg, extack);
3940         }
3941 }
3942
3943 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3944                               struct netlink_ext_ack *extack)
3945 {
3946         struct fib6_config cfg;
3947         int err;
3948
3949         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3950         if (err < 0)
3951                 return err;
3952
3953         if (cfg.fc_mp)
3954                 return ip6_route_multipath_add(&cfg, extack);
3955         else
3956                 return ip6_route_add(&cfg, extack);
3957 }
3958
3959 static size_t rt6_nlmsg_size(struct rt6_info *rt)
3960 {
3961         int nexthop_len = 0;
3962
3963         if (rt->rt6i_nsiblings) {
3964                 nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
3965                             + NLA_ALIGN(sizeof(struct rtnexthop))
3966                             + nla_total_size(16) /* RTA_GATEWAY */
3967                             + lwtunnel_get_encap_size(rt->dst.lwtstate);
3968
3969                 nexthop_len *= rt->rt6i_nsiblings;
3970         }
3971
3972         return NLMSG_ALIGN(sizeof(struct rtmsg))
3973                + nla_total_size(16) /* RTA_SRC */
3974                + nla_total_size(16) /* RTA_DST */
3975                + nla_total_size(16) /* RTA_GATEWAY */
3976                + nla_total_size(16) /* RTA_PREFSRC */
3977                + nla_total_size(4) /* RTA_TABLE */
3978                + nla_total_size(4) /* RTA_IIF */
3979                + nla_total_size(4) /* RTA_OIF */
3980                + nla_total_size(4) /* RTA_PRIORITY */
3981                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
3982                + nla_total_size(sizeof(struct rta_cacheinfo))
3983                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
3984                + nla_total_size(1) /* RTA_PREF */
3985                + lwtunnel_get_encap_size(rt->dst.lwtstate)
3986                + nexthop_len;
3987 }
3988
/* Emit nexthop information for @rt into @skb: dead/linkdown flags,
 * RTA_GATEWAY, offload flag, RTA_OIF (suppressed when @skip_oif — the
 * multipath encoding carries the ifindex in struct rtnexthop instead)
 * and lwtunnel encap attributes.  Returns 0 or -EMSGSIZE.
 */
static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
                            unsigned int *flags, bool skip_oif)
{
        if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) {
                *flags |= RTNH_F_LINKDOWN;
                /* a linkdown nexthop is additionally reported dead when
                 * ignore_routes_with_linkdown is enabled
                 */
                if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
                        *flags |= RTNH_F_DEAD;
        }

        if (rt->rt6i_flags & RTF_GATEWAY) {
                if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
                        goto nla_put_failure;
        }

        if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
                *flags |= RTNH_F_OFFLOAD;

        /* not needed for multipath encoding b/c it has a rtnexthop struct */
        if (!skip_oif && rt->dst.dev &&
            nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
                goto nla_put_failure;

        if (rt->dst.lwtstate &&
            lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
                goto nla_put_failure;

        return 0;

nla_put_failure:
        return -EMSGSIZE;
}
4020
/* add multipath next hop: reserve a struct rtnexthop header, append the
 * nexthop attributes after it, then patch rtnh_len once the final
 * message position is known.  Returns 0 or -EMSGSIZE.
 */
static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
{
        struct rtnexthop *rtnh;
        unsigned int flags = 0;

        rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
        if (!rtnh)
                goto nla_put_failure;

        rtnh->rtnh_hops = 0;
        rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;

        /* skip_oif=true: the ifindex already sits in rtnh_ifindex */
        if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
                goto nla_put_failure;

        rtnh->rtnh_flags = flags;

        /* length of rtnetlink header + attributes */
        rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;

        return 0;

nla_put_failure:
        return -EMSGSIZE;
}
4047
/* Build one rtnetlink route message describing @rt into @skb.
 * When @dst (and, with subtrees, @src) is non-NULL, the message reports
 * that exact /128 address instead of the route's stored prefix; @iif,
 * when non-zero, is reported as RTA_IIF.  Returns 0 on success or
 * -EMSGSIZE when @skb lacks space (the partial message is cancelled).
 */
static int rt6_fill_node(struct net *net,
                         struct sk_buff *skb, struct rt6_info *rt,
                         struct in6_addr *dst, struct in6_addr *src,
                         int iif, int type, u32 portid, u32 seq,
                         unsigned int flags)
{
        u32 metrics[RTAX_MAX];
        struct rtmsg *rtm;
        struct nlmsghdr *nlh;
        long expires;
        u32 table;

        nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
        if (!nlh)
                return -EMSGSIZE;

        rtm = nlmsg_data(nlh);
        rtm->rtm_family = AF_INET6;
        rtm->rtm_dst_len = rt->rt6i_dst.plen;
        rtm->rtm_src_len = rt->rt6i_src.plen;
        rtm->rtm_tos = 0;
        if (rt->rt6i_table)
                table = rt->rt6i_table->tb6_id;
        else
                table = RT6_TABLE_UNSPEC;
        rtm->rtm_table = table;
        if (nla_put_u32(skb, RTA_TABLE, table))
                goto nla_put_failure;
        /* map the reject route's dst.error onto the rtnetlink type */
        if (rt->rt6i_flags & RTF_REJECT) {
                switch (rt->dst.error) {
                case -EINVAL:
                        rtm->rtm_type = RTN_BLACKHOLE;
                        break;
                case -EACCES:
                        rtm->rtm_type = RTN_PROHIBIT;
                        break;
                case -EAGAIN:
                        rtm->rtm_type = RTN_THROW;
                        break;
                default:
                        rtm->rtm_type = RTN_UNREACHABLE;
                        break;
                }
        }
        else if (rt->rt6i_flags & RTF_LOCAL)
                rtm->rtm_type = RTN_LOCAL;
        else if (rt->rt6i_flags & RTF_ANYCAST)
                rtm->rtm_type = RTN_ANYCAST;
        else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
                rtm->rtm_type = RTN_LOCAL;
        else
                rtm->rtm_type = RTN_UNICAST;
        rtm->rtm_flags = 0;
        rtm->rtm_scope = RT_SCOPE_UNIVERSE;
        rtm->rtm_protocol = rt->rt6i_protocol;

        if (rt->rt6i_flags & RTF_CACHE)
                rtm->rtm_flags |= RTM_F_CLONED;

        if (dst) {
                /* caller-specified destination: report it as a /128 */
                if (nla_put_in6_addr(skb, RTA_DST, dst))
                        goto nla_put_failure;
                rtm->rtm_dst_len = 128;
        } else if (rtm->rtm_dst_len)
                if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
                        goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
        if (src) {
                if (nla_put_in6_addr(skb, RTA_SRC, src))
                        goto nla_put_failure;
                rtm->rtm_src_len = 128;
        } else if (rtm->rtm_src_len &&
                   nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
                goto nla_put_failure;
#endif
        if (iif) {
#ifdef CONFIG_IPV6_MROUTE
                /* multicast destinations are answered from the mroute code */
                if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
                        int err = ip6mr_get_route(net, skb, rtm, portid);

                        if (err == 0)
                                return 0;
                        if (err < 0)
                                goto nla_put_failure;
                } else
#endif
                        if (nla_put_u32(skb, RTA_IIF, iif))
                                goto nla_put_failure;
        } else if (dst) {
                struct in6_addr saddr_buf;
                if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
                    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
                        goto nla_put_failure;
        }

        if (rt->rt6i_prefsrc.plen) {
                struct in6_addr saddr_buf;
                saddr_buf = rt->rt6i_prefsrc.addr;
                if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
                        goto nla_put_failure;
        }

        /* metrics, with the cached pmtu overriding the stored RTAX_MTU */
        memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
        if (rt->rt6i_pmtu)
                metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
        if (rtnetlink_put_metrics(skb, metrics) < 0)
                goto nla_put_failure;

        if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
                goto nla_put_failure;

        /* For multipath routes, walk the siblings list and add
         * each as a nexthop within RTA_MULTIPATH.
         */
        if (rt->rt6i_nsiblings) {
                struct rt6_info *sibling, *next_sibling;
                struct nlattr *mp;

                mp = nla_nest_start(skb, RTA_MULTIPATH);
                if (!mp)
                        goto nla_put_failure;

                if (rt6_add_nexthop(skb, rt) < 0)
                        goto nla_put_failure;

                list_for_each_entry_safe(sibling, next_sibling,
                                         &rt->rt6i_siblings, rt6i_siblings) {
                        if (rt6_add_nexthop(skb, sibling) < 0)
                                goto nla_put_failure;
                }

                nla_nest_end(skb, mp);
        } else {
                if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
                        goto nla_put_failure;
        }

        expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;

        if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
                goto nla_put_failure;

        if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
                goto nla_put_failure;


        nlmsg_end(skb, nlh);
        return 0;

nla_put_failure:
        nlmsg_cancel(skb, nlh);
        return -EMSGSIZE;
}
4201
4202 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
4203 {
4204         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4205         struct net *net = arg->net;
4206
4207         if (rt == net->ipv6.ip6_null_entry)
4208                 return 0;
4209
4210         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4211                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4212
4213                 /* user wants prefix routes only */
4214                 if (rtm->rtm_flags & RTM_F_PREFIX &&
4215                     !(rt->rt6i_flags & RTF_PREFIX_RT)) {
4216                         /* success since this is not a prefix route */
4217                         return 1;
4218                 }
4219         }
4220
4221         return rt6_fill_node(net,
4222                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
4223                      NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
4224                      NLM_F_MULTI);
4225 }
4226
/* RTM_GETROUTE handler: resolve the route for the flow described by the
 * netlink attributes and unicast the result back to the requester.
 */
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
                              struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(in_skb->sk);
        struct nlattr *tb[RTA_MAX+1];
        int err, iif = 0, oif = 0;
        struct dst_entry *dst;
        struct rt6_info *rt;
        struct sk_buff *skb;
        struct rtmsg *rtm;
        struct flowi6 fl6;
        bool fibmatch;

        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
                          extack);
        if (err < 0)
                goto errout;

        err = -EINVAL;
        memset(&fl6, 0, sizeof(fl6));
        rtm = nlmsg_data(nlh);
        fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
        /* RTM_F_FIB_MATCH: report the FIB entry that matched rather than
         * the route actually used (which may be a clone/cached entry).
         */
        fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

        if (tb[RTA_SRC]) {
                if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
                        goto errout;

                fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
        }

        if (tb[RTA_DST]) {
                if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
                        goto errout;

                fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
        }

        if (tb[RTA_IIF])
                iif = nla_get_u32(tb[RTA_IIF]);

        if (tb[RTA_OIF])
                oif = nla_get_u32(tb[RTA_OIF]);

        if (tb[RTA_MARK])
                fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

        if (tb[RTA_UID])
                fl6.flowi6_uid = make_kuid(current_user_ns(),
                                           nla_get_u32(tb[RTA_UID]));
        else
                fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

        if (iif) {
                /* Input-side lookup: resolve as if the packet had arrived
                 * on the given interface. Device pointer is only valid
                 * under RCU.
                 */
                struct net_device *dev;
                int flags = 0;

                rcu_read_lock();

                dev = dev_get_by_index_rcu(net, iif);
                if (!dev) {
                        rcu_read_unlock();
                        err = -ENODEV;
                        goto errout;
                }

                fl6.flowi6_iif = iif;

                if (!ipv6_addr_any(&fl6.saddr))
                        flags |= RT6_LOOKUP_F_HAS_SADDR;

                if (!fibmatch)
                        dst = ip6_route_input_lookup(net, dev, &fl6, flags);
                else
                        dst = ip6_route_lookup(net, &fl6, 0);

                rcu_read_unlock();
        } else {
                /* Output-side lookup. */
                fl6.flowi6_oif = oif;

                if (!fibmatch)
                        dst = ip6_route_output(net, NULL, &fl6);
                else
                        dst = ip6_route_lookup(net, &fl6, 0);
        }


        rt = container_of(dst, struct rt6_info, dst);
        if (rt->dst.error) {
                err = rt->dst.error;
                ip6_rt_put(rt);
                goto errout;
        }

        /* NOTE(review): the null entry presumably carries -ENETUNREACH in
         * dst.error, making this check redundant with the one above on
         * most paths — confirm against the entry template before relying
         * on it.
         */
        if (rt == net->ipv6.ip6_null_entry) {
                err = rt->dst.error;
                ip6_rt_put(rt);
                goto errout;
        }

        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
        if (!skb) {
                ip6_rt_put(rt);
                err = -ENOBUFS;
                goto errout;
        }

        /* The skb takes over our dst reference; it is released when the
         * skb is freed.
         */
        skb_dst_set(skb, &rt->dst);
        if (fibmatch)
                err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
                                    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
                                    nlh->nlmsg_seq, 0);
        else
                err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
                                    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
                                    nlh->nlmsg_seq, 0);
        if (err < 0) {
                kfree_skb(skb);
                goto errout;
        }

        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
        return err;
}
4352
4353 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
4354                      unsigned int nlm_flags)
4355 {
4356         struct sk_buff *skb;
4357         struct net *net = info->nl_net;
4358         u32 seq;
4359         int err;
4360
4361         err = -ENOBUFS;
4362         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4363
4364         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4365         if (!skb)
4366                 goto errout;
4367
4368         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
4369                                 event, info->portid, seq, nlm_flags);
4370         if (err < 0) {
4371                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4372                 WARN_ON(err == -EMSGSIZE);
4373                 kfree_skb(skb);
4374                 goto errout;
4375         }
4376         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4377                     info->nlh, gfp_any());
4378         return;
4379 errout:
4380         if (err < 0)
4381                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4382 }
4383
4384 static int ip6_route_dev_notify(struct notifier_block *this,
4385                                 unsigned long event, void *ptr)
4386 {
4387         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4388         struct net *net = dev_net(dev);
4389
4390         if (!(dev->flags & IFF_LOOPBACK))
4391                 return NOTIFY_OK;
4392
4393         if (event == NETDEV_REGISTER) {
4394                 net->ipv6.ip6_null_entry->dst.dev = dev;
4395                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
4396 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4397                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
4398                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
4399                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
4400                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
4401 #endif
4402          } else if (event == NETDEV_UNREGISTER &&
4403                     dev->reg_state != NETREG_UNREGISTERED) {
4404                 /* NETDEV_UNREGISTER could be fired for multiple times by
4405                  * netdev_wait_allrefs(). Make sure we only call this once.
4406                  */
4407                 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
4408 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4409                 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
4410                 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
4411 #endif
4412         }
4413
4414         return NOTIFY_OK;
4415 }
4416
4417 /*
4418  *      /proc
4419  */
4420
4421 #ifdef CONFIG_PROC_FS
4422
4423 static const struct file_operations ipv6_route_proc_fops = {
4424         .owner          = THIS_MODULE,
4425         .open           = ipv6_route_open,
4426         .read           = seq_read,
4427         .llseek         = seq_lseek,
4428         .release        = seq_release_net,
4429 };
4430
4431 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
4432 {
4433         struct net *net = (struct net *)seq->private;
4434         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
4435                    net->ipv6.rt6_stats->fib_nodes,
4436                    net->ipv6.rt6_stats->fib_route_nodes,
4437                    atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
4438                    net->ipv6.rt6_stats->fib_rt_entries,
4439                    net->ipv6.rt6_stats->fib_rt_cache,
4440                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
4441                    net->ipv6.rt6_stats->fib_discarded_routes);
4442
4443         return 0;
4444 }
4445
/* open() hook for /proc/net/rt6_stats; binds the show routine to the
 * owning network namespace.
 */
static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
	return single_open_net(inode, file, rt6_stats_seq_show);
}
4450
4451 static const struct file_operations rt6_stats_seq_fops = {
4452         .owner   = THIS_MODULE,
4453         .open    = rt6_stats_seq_open,
4454         .read    = seq_read,
4455         .llseek  = seq_lseek,
4456         .release = single_release_net,
4457 };
4458 #endif  /* CONFIG_PROC_FS */
4459
4460 #ifdef CONFIG_SYSCTL
4461
4462 static
4463 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4464                               void __user *buffer, size_t *lenp, loff_t *ppos)
4465 {
4466         struct net *net;
4467         int delay;
4468         if (!write)
4469                 return -EINVAL;
4470
4471         net = (struct net *)ctl->extra1;
4472         delay = net->ipv6.sysctl.flush_delay;
4473         proc_dointvec(ctl, write, buffer, lenp, ppos);
4474         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
4475         return 0;
4476 }
4477
/* Template for the per-namespace /proc/sys/net/ipv6/route/ table.
 * NOTE: ipv6_route_sysctl_init() rewrites each entry's .data by
 * positional index (table[0]..table[9]); keep the entry order here in
 * sync with that function.
 */
struct ctl_table ipv6_route_table_template[] = {
	{
		/* Write-only trigger (mode 0200); see
		 * ipv6_sysctl_rtcache_flush().
		 */
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		/* Points at the dst_ops template; repointed to the netns
		 * copy in ipv6_route_sysctl_init().
		 */
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		/* Same backing variable as gc_min_interval, exposed in
		 * milliseconds.
		 */
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{ }	/* sentinel */
};
4551
4552 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
4553 {
4554         struct ctl_table *table;
4555
4556         table = kmemdup(ipv6_route_table_template,
4557                         sizeof(ipv6_route_table_template),
4558                         GFP_KERNEL);
4559
4560         if (table) {
4561                 table[0].data = &net->ipv6.sysctl.flush_delay;
4562                 table[0].extra1 = net;
4563                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
4564                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
4565                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4566                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
4567                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
4568                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
4569                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
4570                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
4571                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4572
4573                 /* Don't export sysctls to unprivileged users */
4574                 if (net->user_ns != &init_user_ns)
4575                         table[0].procname = NULL;
4576         }
4577
4578         return table;
4579 }
4580 #endif
4581
/* Per-netns route state setup: clone the dst_ops template, allocate the
 * special route entries (null; plus prohibit/blackhole with multiple
 * tables), and seed the routing sysctl defaults. Returns 0 or -ENOMEM.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_ip6_dst_entries;
	/* Each special entry is its own dst path terminus and uses this
	 * netns's dst_ops and the shared template metrics.
	 */
	net->ipv6.ip6_null_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	/* Defaults for the knobs exposed via ipv6_route_table_template. */
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

	/* Error unwinding, in reverse allocation order. */
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
4654
/* Per-netns teardown: free the special route entries and destroy the dst
 * entry counter. Counterpart of ip6_route_net_init().
 */
static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
4664
/* Late per-netns init: create the /proc/net entries. Runs after the main
 * routing state exists (registered via ip6_route_net_late_ops).
 */
static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	/* NOTE(review): proc_create() failures are ignored here — the proc
	 * files are best-effort and their absence is not fatal.
	 */
	proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
	proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
#endif
	return 0;
}
4673
/* Late per-netns exit: remove the /proc/net entries created by
 * ip6_route_net_init_late().
 */
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
4681
4682 static struct pernet_operations ip6_route_net_ops = {
4683         .init = ip6_route_net_init,
4684         .exit = ip6_route_net_exit,
4685 };
4686
4687 static int __net_init ipv6_inetpeer_init(struct net *net)
4688 {
4689         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
4690
4691         if (!bp)
4692                 return -ENOMEM;
4693         inet_peer_base_init(bp);
4694         net->ipv6.peers = bp;
4695         return 0;
4696 }
4697
4698 static void __net_exit ipv6_inetpeer_exit(struct net *net)
4699 {
4700         struct inet_peer_base *bp = net->ipv6.peers;
4701
4702         net->ipv6.peers = NULL;
4703         inetpeer_invalidate_tree(bp);
4704         kfree(bp);
4705 }
4706
4707 static struct pernet_operations ipv6_inetpeer_ops = {
4708         .init   =       ipv6_inetpeer_init,
4709         .exit   =       ipv6_inetpeer_exit,
4710 };
4711
4712 static struct pernet_operations ip6_route_net_late_ops = {
4713         .init = ip6_route_net_init_late,
4714         .exit = ip6_route_net_exit_late,
4715 };
4716
/* Loopback register/unregister notifier for the special route entries.
 * NOTE(review): priority is ADDRCONF_NOTIFY_PRIORITY - 10, presumably so
 * this runs after addrconf's own notifier — confirm the intended ordering.
 */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
4721
/* Bind init_net's special route entries to the loopback device and take
 * their idev references by hand.
 */
void __init ip6_route_init_special_entries(void)
{
	/* Registering of the loopback is done before this portion of code,
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for init_net */
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #endif
}
4736
/* Subsystem init: set up the rt6_info slab, pernet state, fib6/xfrm/rules
 * subsystems, rtnetlink route handlers, the device notifier, and the
 * per-CPU uncached route lists. Unwinds in reverse order on failure.
 */
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	/* Blackhole dsts share the rt6_info slab. */
	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	/* GETROUTE may run without the RTNL lock (RTNL_FLAG_DOIT_UNLOCKED). */
	ret = -ENOBUFS;
	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, 0) ||
	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, 0) ||
	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL,
			    RTNL_FLAG_DOIT_UNLOCKED))
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

	/* Error unwinding, reverse order of registration. */
out_register_late_subsys:
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
4818
/* Subsystem teardown: undo ip6_route_init() in strict reverse order of
 * registration.
 */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}