1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD;
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #define pr_fmt(fmt) "IPv4: " fmt
66
67 #include <linux/module.h>
68 #include <linux/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/skbuff.h>
83 #include <linux/inetdevice.h>
84 #include <linux/igmp.h>
85 #include <linux/pkt_sched.h>
86 #include <linux/mroute.h>
87 #include <linux/netfilter_ipv4.h>
88 #include <linux/random.h>
89 #include <linux/rcupdate.h>
90 #include <linux/times.h>
91 #include <linux/slab.h>
92 #include <linux/jhash.h>
93 #include <net/dst.h>
94 #include <net/dst_metadata.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/lwtunnel.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #endif
112 #include <net/secure_seq.h>
113 #include <net/ip_tunnels.h>
114 #include <net/l3mdev.h>
115
116 #include "fib_lookup.h"
117
118 #define RT_FL_TOS(oldflp4) \
119         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
120
121 #define RT_GC_TIMEOUT (300*HZ)
122
123 static int ip_rt_max_size;
124 static int ip_rt_redirect_number __read_mostly  = 9;
125 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
126 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
127 static int ip_rt_error_cost __read_mostly       = HZ;
128 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
129 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
130 static u32 ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
131 static int ip_rt_min_advmss __read_mostly       = 256;
132
133 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
134
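/*
 * With HZ == 1000 the defaults above work out as follows (a quick sanity
 * check; they remain runtime-tunable through the net.ipv4.route.* sysctl
 * table defined later in this file):
 *
 *	ip_rt_redirect_load	20ms		(HZ / 50)
 *	ip_rt_redirect_silence	~20.5s		((HZ / 50) << 10)
 *	ip_rt_error_cost	1s per error	(HZ)
 *	ip_rt_error_burst	5s of tokens	(5 * HZ)
 *	ip_rt_mtu_expires	10min		(10 * 60 * HZ)
 *	ip_rt_min_pmtu		552 bytes	(512 + 20 + 20)
 */
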
135 /*
136  *      Interface to generic destination cache.
137  */
138
139 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
140 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
141 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
142 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
143 static void              ipv4_link_failure(struct sk_buff *skb);
144 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
145                                            struct sk_buff *skb, u32 mtu);
146 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
147                                         struct sk_buff *skb);
148 static void             ipv4_dst_destroy(struct dst_entry *dst);
149
150 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
151 {
152         WARN_ON(1);
153         return NULL;
154 }
155
156 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
157                                            struct sk_buff *skb,
158                                            const void *daddr);
159 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
160
161 static struct dst_ops ipv4_dst_ops = {
162         .family =               AF_INET,
163         .check =                ipv4_dst_check,
164         .default_advmss =       ipv4_default_advmss,
165         .mtu =                  ipv4_mtu,
166         .cow_metrics =          ipv4_cow_metrics,
167         .destroy =              ipv4_dst_destroy,
168         .negative_advice =      ipv4_negative_advice,
169         .link_failure =         ipv4_link_failure,
170         .update_pmtu =          ip_rt_update_pmtu,
171         .redirect =             ip_do_redirect,
172         .local_out =            __ip_local_out,
173         .neigh_lookup =         ipv4_neigh_lookup,
174         .confirm_neigh =        ipv4_confirm_neigh,
175 };
176
177 #define ECN_OR_COST(class)      TC_PRIO_##class
178
179 const __u8 ip_tos2prio[16] = {
180         TC_PRIO_BESTEFFORT,
181         ECN_OR_COST(BESTEFFORT),
182         TC_PRIO_BESTEFFORT,
183         ECN_OR_COST(BESTEFFORT),
184         TC_PRIO_BULK,
185         ECN_OR_COST(BULK),
186         TC_PRIO_BULK,
187         ECN_OR_COST(BULK),
188         TC_PRIO_INTERACTIVE,
189         ECN_OR_COST(INTERACTIVE),
190         TC_PRIO_INTERACTIVE,
191         ECN_OR_COST(INTERACTIVE),
192         TC_PRIO_INTERACTIVE_BULK,
193         ECN_OR_COST(INTERACTIVE_BULK),
194         TC_PRIO_INTERACTIVE_BULK,
195         ECN_OR_COST(INTERACTIVE_BULK)
196 };
197 EXPORT_SYMBOL(ip_tos2prio);
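
/*
 * Example lookup: rt_tos2priority() in include/net/route.h indexes this
 * table as ip_tos2prio[IPTOS_TOS(tos) >> 1].  With tos == IPTOS_LOWDELAY
 * (0x10), the index is (0x10 & IPTOS_TOS_MASK) >> 1 == 8, i.e.
 * TC_PRIO_INTERACTIVE.
 */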
198
199 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
200 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
201
202 #ifdef CONFIG_PROC_FS
203 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
204 {
205         if (*pos)
206                 return NULL;
207         return SEQ_START_TOKEN;
208 }
209
210 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
211 {
212         ++*pos;
213         return NULL;
214 }
215
216 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
217 {
218 }
219
220 static int rt_cache_seq_show(struct seq_file *seq, void *v)
221 {
222         if (v == SEQ_START_TOKEN)
223                 seq_printf(seq, "%-127s\n",
224                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
225                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
226                            "HHUptod\tSpecDst");
227         return 0;
228 }
229
230 static const struct seq_operations rt_cache_seq_ops = {
231         .start  = rt_cache_seq_start,
232         .next   = rt_cache_seq_next,
233         .stop   = rt_cache_seq_stop,
234         .show   = rt_cache_seq_show,
235 };
236
237 static int rt_cache_seq_open(struct inode *inode, struct file *file)
238 {
239         return seq_open(file, &rt_cache_seq_ops);
240 }
241
242 static const struct file_operations rt_cache_seq_fops = {
243         .open    = rt_cache_seq_open,
244         .read    = seq_read,
245         .llseek  = seq_lseek,
246         .release = seq_release,
247 };
248
249
250 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
251 {
252         int cpu;
253
254         if (*pos == 0)
255                 return SEQ_START_TOKEN;
256
257         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
258                 if (!cpu_possible(cpu))
259                         continue;
260                 *pos = cpu+1;
261                 return &per_cpu(rt_cache_stat, cpu);
262         }
263         return NULL;
264 }
265
266 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
267 {
268         int cpu;
269
270         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
271                 if (!cpu_possible(cpu))
272                         continue;
273                 *pos = cpu+1;
274                 return &per_cpu(rt_cache_stat, cpu);
275         }
276         return NULL;
277
278 }
279
280 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
281 {
282
283 }
284
285 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
286 {
287         struct rt_cache_stat *st = v;
288
289         if (v == SEQ_START_TOKEN) {
290                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
291                 return 0;
292         }
293
294         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
295                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
296                    dst_entries_get_slow(&ipv4_dst_ops),
297                    0, /* st->in_hit */
298                    st->in_slow_tot,
299                    st->in_slow_mc,
300                    st->in_no_route,
301                    st->in_brd,
302                    st->in_martian_dst,
303                    st->in_martian_src,
304
305                    0, /* st->out_hit */
306                    st->out_slow_tot,
307                    st->out_slow_mc,
308
309                    0, /* st->gc_total */
310                    0, /* st->gc_ignored */
311                    0, /* st->gc_goal_miss */
312                    0, /* st->gc_dst_overflow */
313                    0, /* st->in_hlist_search */
314                    0  /* st->out_hlist_search */
315                 );
316         return 0;
317 }
318
319 static const struct seq_operations rt_cpu_seq_ops = {
320         .start  = rt_cpu_seq_start,
321         .next   = rt_cpu_seq_next,
322         .stop   = rt_cpu_seq_stop,
323         .show   = rt_cpu_seq_show,
324 };
325
326
327 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
328 {
329         return seq_open(file, &rt_cpu_seq_ops);
330 }
331
332 static const struct file_operations rt_cpu_seq_fops = {
333         .open    = rt_cpu_seq_open,
334         .read    = seq_read,
335         .llseek  = seq_lseek,
336         .release = seq_release,
337 };
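
/*
 * Both files keep their historical names from the routing-cache days:
 * /proc/net/rt_cache now prints only its header line (the cache itself is
 * gone), while /proc/net/stat/rt_cache prints one row of hex counters per
 * possible CPU, with the columns for removed counters pinned to zero as
 * seen above.
 */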
338
339 #ifdef CONFIG_IP_ROUTE_CLASSID
340 static int rt_acct_proc_show(struct seq_file *m, void *v)
341 {
342         struct ip_rt_acct *dst, *src;
343         unsigned int i, j;
344
345         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
346         if (!dst)
347                 return -ENOMEM;
348
349         for_each_possible_cpu(i) {
350                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
351                 for (j = 0; j < 256; j++) {
352                         dst[j].o_bytes   += src[j].o_bytes;
353                         dst[j].o_packets += src[j].o_packets;
354                         dst[j].i_bytes   += src[j].i_bytes;
355                         dst[j].i_packets += src[j].i_packets;
356                 }
357         }
358
359         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
360         kfree(dst);
361         return 0;
362 }
363 #endif
364
365 static int __net_init ip_rt_do_proc_init(struct net *net)
366 {
367         struct proc_dir_entry *pde;
368
369         pde = proc_create("rt_cache", 0444, net->proc_net,
370                           &rt_cache_seq_fops);
371         if (!pde)
372                 goto err1;
373
374         pde = proc_create("rt_cache", 0444,
375                           net->proc_net_stat, &rt_cpu_seq_fops);
376         if (!pde)
377                 goto err2;
378
379 #ifdef CONFIG_IP_ROUTE_CLASSID
380         pde = proc_create_single("rt_acct", 0, net->proc_net,
381                         rt_acct_proc_show);
382         if (!pde)
383                 goto err3;
384 #endif
385         return 0;
386
387 #ifdef CONFIG_IP_ROUTE_CLASSID
388 err3:
389         remove_proc_entry("rt_cache", net->proc_net_stat);
390 #endif
391 err2:
392         remove_proc_entry("rt_cache", net->proc_net);
393 err1:
394         return -ENOMEM;
395 }
396
397 static void __net_exit ip_rt_do_proc_exit(struct net *net)
398 {
399         remove_proc_entry("rt_cache", net->proc_net_stat);
400         remove_proc_entry("rt_cache", net->proc_net);
401 #ifdef CONFIG_IP_ROUTE_CLASSID
402         remove_proc_entry("rt_acct", net->proc_net);
403 #endif
404 }
405
406 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
407         .init = ip_rt_do_proc_init,
408         .exit = ip_rt_do_proc_exit,
409 };
410
411 static int __init ip_rt_proc_init(void)
412 {
413         return register_pernet_subsys(&ip_rt_proc_ops);
414 }
415
416 #else
417 static inline int ip_rt_proc_init(void)
418 {
419         return 0;
420 }
421 #endif /* CONFIG_PROC_FS */
422
423 static inline bool rt_is_expired(const struct rtable *rth)
424 {
425         return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
426 }
427
428 void rt_cache_flush(struct net *net)
429 {
430         rt_genid_bump_ipv4(net);
431 }
432
433 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
434                                            struct sk_buff *skb,
435                                            const void *daddr)
436 {
437         struct net_device *dev = dst->dev;
438         const __be32 *pkey = daddr;
439         const struct rtable *rt;
440         struct neighbour *n;
441
442         rt = (const struct rtable *) dst;
443         if (rt->rt_gateway)
444                 pkey = (const __be32 *) &rt->rt_gateway;
445         else if (skb)
446                 pkey = &ip_hdr(skb)->daddr;
447
448         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
449         if (n)
450                 return n;
451         return neigh_create(&arp_tbl, pkey, dev);
452 }
453
454 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
455 {
456         struct net_device *dev = dst->dev;
457         const __be32 *pkey = daddr;
458         const struct rtable *rt;
459
460         rt = (const struct rtable *)dst;
461         if (rt->rt_gateway)
462                 pkey = (const __be32 *)&rt->rt_gateway;
463         else if (!daddr ||
464                  (rt->rt_flags &
465                   (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
466                 return;
467
468         __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
469 }
470
471 #define IP_IDENTS_SZ 2048u
472
473 static atomic_t *ip_idents __read_mostly;
474 static u32 *ip_tstamps __read_mostly;
475
476 /* In order to protect privacy, we add a perturbation to identifiers
477  * if one generator is seldom used. This makes it hard for an attacker
478  * to infer how many packets were sent between two points in time.
479  */
480 u32 ip_idents_reserve(u32 hash, int segs)
481 {
482         u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
483         atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
484         u32 old = READ_ONCE(*p_tstamp);
485         u32 now = (u32)jiffies;
486         u32 new, delta = 0;
487
488         if (old != now && cmpxchg(p_tstamp, old, now) == old)
489                 delta = prandom_u32_max(now - old);
490
491         /* Do not use atomic_add_return() as it makes UBSAN unhappy */
492         do {
493                 old = (u32)atomic_read(p_id);
494                 new = old + delta + segs;
495         } while (atomic_cmpxchg(p_id, old, new) != old);
496
497         return new - segs;
498 }
499 EXPORT_SYMBOL(ip_idents_reserve);
500
501 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
502 {
503         u32 hash, id;
504
505         /* The lazy key init below is racy, but a double init is harmless. */
506         if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
507                 get_random_bytes(&net->ipv4.ip_id_key,
508                                  sizeof(net->ipv4.ip_id_key));
509
510         hash = siphash_3u32((__force u32)iph->daddr,
511                             (__force u32)iph->saddr,
512                             iph->protocol,
513                             &net->ipv4.ip_id_key);
514         id = ip_idents_reserve(hash, segs);
515         iph->id = htons(id);
516 }
517 EXPORT_SYMBOL(__ip_select_ident);
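
/*
 * Putting the two halves together, a sketch (the caller shown here is
 * hypothetical) of reserving IDs for a 3-segment GSO burst:
 *
 *	hash = siphash_3u32((__force u32)iph->daddr,
 *			    (__force u32)iph->saddr,
 *			    iph->protocol, &net->ipv4.ip_id_key);
 *	id = ip_idents_reserve(hash, 3);	// first of 3 consecutive IDs
 *
 * The hash picks one of IP_IDENTS_SZ (2048) shared counters, so flows
 * that collide in a bucket share an ID sequence, and the random delta
 * injected after idle periods keeps an off-path observer from counting
 * packets sent between two probes.
 */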
518
519 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
520                              const struct sock *sk,
521                              const struct iphdr *iph,
522                              int oif, u8 tos,
523                              u8 prot, u32 mark, int flow_flags)
524 {
525         if (sk) {
526                 const struct inet_sock *inet = inet_sk(sk);
527
528                 oif = sk->sk_bound_dev_if;
529                 mark = sk->sk_mark;
530                 tos = RT_CONN_FLAGS(sk);
531                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
532         }
533         flowi4_init_output(fl4, oif, mark, tos,
534                            RT_SCOPE_UNIVERSE, prot,
535                            flow_flags,
536                            iph->daddr, iph->saddr, 0, 0,
537                            sock_net_uid(net, sk));
538 }
539
540 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
541                                const struct sock *sk)
542 {
543         const struct net *net = dev_net(skb->dev);
544         const struct iphdr *iph = ip_hdr(skb);
545         int oif = skb->dev->ifindex;
546         u8 tos = RT_TOS(iph->tos);
547         u8 prot = iph->protocol;
548         u32 mark = skb->mark;
549
550         __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
551 }
552
553 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
554 {
555         const struct inet_sock *inet = inet_sk(sk);
556         const struct ip_options_rcu *inet_opt;
557         __be32 daddr = inet->inet_daddr;
558
559         rcu_read_lock();
560         inet_opt = rcu_dereference(inet->inet_opt);
561         if (inet_opt && inet_opt->opt.srr)
562                 daddr = inet_opt->opt.faddr;
563         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
564                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
565                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
566                            inet_sk_flowi_flags(sk),
567                            daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
568         rcu_read_unlock();
569 }
570
571 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
572                                  const struct sk_buff *skb)
573 {
574         if (skb)
575                 build_skb_flow_key(fl4, skb, sk);
576         else
577                 build_sk_flow_key(fl4, sk);
578 }
579
580 static DEFINE_SPINLOCK(fnhe_lock);
581
582 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
583 {
584         struct rtable *rt;
585
586         rt = rcu_dereference(fnhe->fnhe_rth_input);
587         if (rt) {
588                 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
589                 dst_dev_put(&rt->dst);
590                 dst_release(&rt->dst);
591         }
592         rt = rcu_dereference(fnhe->fnhe_rth_output);
593         if (rt) {
594                 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
595                 dst_dev_put(&rt->dst);
596                 dst_release(&rt->dst);
597         }
598 }
599
600 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
601 {
602         struct fib_nh_exception *fnhe, *oldest;
603
604         oldest = rcu_dereference(hash->chain);
605         for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
606              fnhe = rcu_dereference(fnhe->fnhe_next)) {
607                 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
608                         oldest = fnhe;
609         }
610         fnhe_flush_routes(oldest);
611         return oldest;
612 }
613
614 static inline u32 fnhe_hashfun(__be32 daddr)
615 {
616         static u32 fnhe_hashrnd __read_mostly;
617         u32 hval;
618
619         net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
620         hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
621         return hash_32(hval, FNHE_HASH_SHIFT);
622 }
623
624 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
625 {
626         rt->rt_pmtu = fnhe->fnhe_pmtu;
627         rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
628         rt->dst.expires = fnhe->fnhe_expires;
629
630         if (fnhe->fnhe_gw) {
631                 rt->rt_flags |= RTCF_REDIRECTED;
632                 rt->rt_gateway = fnhe->fnhe_gw;
633                 rt->rt_uses_gateway = 1;
634         }
635 }
636
637 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
638                                   u32 pmtu, bool lock, unsigned long expires)
639 {
640         struct fnhe_hash_bucket *hash;
641         struct fib_nh_exception *fnhe;
642         struct rtable *rt;
643         u32 genid, hval;
644         unsigned int i;
645         int depth;
646
647         genid = fnhe_genid(dev_net(nh->nh_dev));
648         hval = fnhe_hashfun(daddr);
649
650         spin_lock_bh(&fnhe_lock);
651
652         hash = rcu_dereference(nh->nh_exceptions);
653         if (!hash) {
654                 hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
655                 if (!hash)
656                         goto out_unlock;
657                 rcu_assign_pointer(nh->nh_exceptions, hash);
658         }
659
660         hash += hval;
661
662         depth = 0;
663         for (fnhe = rcu_dereference(hash->chain); fnhe;
664              fnhe = rcu_dereference(fnhe->fnhe_next)) {
665                 if (fnhe->fnhe_daddr == daddr)
666                         break;
667                 depth++;
668         }
669
670         if (fnhe) {
671                 if (fnhe->fnhe_genid != genid)
672                         fnhe->fnhe_genid = genid;
673                 if (gw)
674                         fnhe->fnhe_gw = gw;
675                 if (pmtu) {
676                         fnhe->fnhe_pmtu = pmtu;
677                         fnhe->fnhe_mtu_locked = lock;
678                 }
679                 fnhe->fnhe_expires = max(1UL, expires);
680                 /* Update all cached dsts too */
681                 rt = rcu_dereference(fnhe->fnhe_rth_input);
682                 if (rt)
683                         fill_route_from_fnhe(rt, fnhe);
684                 rt = rcu_dereference(fnhe->fnhe_rth_output);
685                 if (rt)
686                         fill_route_from_fnhe(rt, fnhe);
687         } else {
688                 if (depth > FNHE_RECLAIM_DEPTH)
689                         fnhe = fnhe_oldest(hash);
690                 else {
691                         fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
692                         if (!fnhe)
693                                 goto out_unlock;
694
695                         fnhe->fnhe_next = hash->chain;
696                         rcu_assign_pointer(hash->chain, fnhe);
697                 }
698                 fnhe->fnhe_genid = genid;
699                 fnhe->fnhe_daddr = daddr;
700                 fnhe->fnhe_gw = gw;
701                 fnhe->fnhe_pmtu = pmtu;
702                 fnhe->fnhe_mtu_locked = lock;
703                 fnhe->fnhe_expires = max(1UL, expires);
704
705                 /* Exception created; mark the cached routes for the nexthop
706                  * stale, so anyone caching it rechecks if this exception
707                  * applies to them.
708                  */
709                 rt = rcu_dereference(nh->nh_rth_input);
710                 if (rt)
711                         rt->dst.obsolete = DST_OBSOLETE_KILL;
712
713                 for_each_possible_cpu(i) {
714                         struct rtable __rcu **prt;
715                         prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
716                         rt = rcu_dereference(*prt);
717                         if (rt)
718                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
719                 }
720         }
721
722         fnhe->fnhe_stamp = jiffies;
723
724 out_unlock:
725         spin_unlock_bh(&fnhe_lock);
726 }
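
/*
 * Exceptions are created from two callers below: __ip_do_redirect()
 * records a learned gateway in fnhe_gw, and __ip_rt_update_pmtu()
 * records a learned path MTU in fnhe_pmtu.  find_exception() is the
 * read side consulted when routes are looked up.
 */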
727
728 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
729                              bool kill_route)
730 {
731         __be32 new_gw = icmp_hdr(skb)->un.gateway;
732         __be32 old_gw = ip_hdr(skb)->saddr;
733         struct net_device *dev = skb->dev;
734         struct in_device *in_dev;
735         struct fib_result res;
736         struct neighbour *n;
737         struct net *net;
738
739         switch (icmp_hdr(skb)->code & 7) {
740         case ICMP_REDIR_NET:
741         case ICMP_REDIR_NETTOS:
742         case ICMP_REDIR_HOST:
743         case ICMP_REDIR_HOSTTOS:
744                 break;
745
746         default:
747                 return;
748         }
749
750         if (rt->rt_gateway != old_gw)
751                 return;
752
753         in_dev = __in_dev_get_rcu(dev);
754         if (!in_dev)
755                 return;
756
757         net = dev_net(dev);
758         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
759             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
760             ipv4_is_zeronet(new_gw))
761                 goto reject_redirect;
762
763         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
764                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
765                         goto reject_redirect;
766                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
767                         goto reject_redirect;
768         } else {
769                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
770                         goto reject_redirect;
771         }
772
773         n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
774         if (!n)
775                 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
776         if (!IS_ERR(n)) {
777                 if (!(n->nud_state & NUD_VALID)) {
778                         neigh_event_send(n, NULL);
779                 } else {
780                         if (fib_lookup(net, fl4, &res, 0) == 0) {
781                                 struct fib_nh *nh = &FIB_RES_NH(res);
782
783                                 update_or_create_fnhe(nh, fl4->daddr, new_gw,
784                                                 0, false,
785                                                 jiffies + ip_rt_gc_timeout);
786                         }
787                         if (kill_route)
788                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
789                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
790                 }
791                 neigh_release(n);
792         }
793         return;
794
795 reject_redirect:
796 #ifdef CONFIG_IP_ROUTE_VERBOSE
797         if (IN_DEV_LOG_MARTIANS(in_dev)) {
798                 const struct iphdr *iph = (const struct iphdr *) skb->data;
799                 __be32 daddr = iph->daddr;
800                 __be32 saddr = iph->saddr;
801
802                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
803                                      "  Advised path = %pI4 -> %pI4\n",
804                                      &old_gw, dev->name, &new_gw,
805                                      &saddr, &daddr);
806         }
807 #endif
808         ;
809 }
810
811 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
812 {
813         struct rtable *rt;
814         struct flowi4 fl4;
815         const struct iphdr *iph = (const struct iphdr *) skb->data;
816         struct net *net = dev_net(skb->dev);
817         int oif = skb->dev->ifindex;
818         u8 tos = RT_TOS(iph->tos);
819         u8 prot = iph->protocol;
820         u32 mark = skb->mark;
821
822         rt = (struct rtable *) dst;
823
824         __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
825         __ip_do_redirect(rt, skb, &fl4, true);
826 }
827
828 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
829 {
830         struct rtable *rt = (struct rtable *)dst;
831         struct dst_entry *ret = dst;
832
833         if (rt) {
834                 if (dst->obsolete > 0) {
835                         ip_rt_put(rt);
836                         ret = NULL;
837                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
838                            rt->dst.expires) {
839                         ip_rt_put(rt);
840                         ret = NULL;
841                 }
842         }
843         return ret;
844 }
845
846 /*
847  * Algorithm:
848  *      1. The first ip_rt_redirect_number redirects are sent
849  *         with exponential backoff, then we stop sending them altogether,
850  *         assuming that the host ignores our redirects.
851  *      2. If we did not see packets requiring redirects
852  *         during ip_rt_redirect_silence, we assume that the host
853  *         forgot the redirected route and start sending redirects again.
854  *
855  * This algorithm is much cheaper and more intelligent than dumb load limiting
856  * in icmp.c.
857  *
858  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
859  * and "frag. need" (breaks PMTU discovery) in icmp.c.
860  */
861
862 void ip_rt_send_redirect(struct sk_buff *skb)
863 {
864         struct rtable *rt = skb_rtable(skb);
865         struct in_device *in_dev;
866         struct inet_peer *peer;
867         struct net *net;
868         int log_martians;
869         int vif;
870
871         rcu_read_lock();
872         in_dev = __in_dev_get_rcu(rt->dst.dev);
873         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
874                 rcu_read_unlock();
875                 return;
876         }
877         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
878         vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
879         rcu_read_unlock();
880
881         net = dev_net(rt->dst.dev);
882         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
883         if (!peer) {
884                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
885                           rt_nexthop(rt, ip_hdr(skb)->daddr));
886                 return;
887         }
888
889         /* No redirected packets during ip_rt_redirect_silence;
890          * reset the algorithm.
891          */
892         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
893                 peer->rate_tokens = 0;
894                 peer->n_redirects = 0;
895         }
896
897         /* Too many ignored redirects; do not send anything and
898          * set peer->rate_last to the last seen redirected packet.
899          */
900         if (peer->n_redirects >= ip_rt_redirect_number) {
901                 peer->rate_last = jiffies;
902                 goto out_put_peer;
903         }
904
905         /* Check for load limit; set rate_last to the latest sent
906          * redirect.
907          */
908         if (peer->rate_tokens == 0 ||
909             time_after(jiffies,
910                        (peer->rate_last +
911                         (ip_rt_redirect_load << peer->rate_tokens)))) {
912                 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
913
914                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
915                 peer->rate_last = jiffies;
916                 ++peer->rate_tokens;
917                 ++peer->n_redirects;
918 #ifdef CONFIG_IP_ROUTE_VERBOSE
919                 if (log_martians &&
920                     peer->rate_tokens == ip_rt_redirect_number)
921                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
922                                              &ip_hdr(skb)->saddr, inet_iif(skb),
923                                              &ip_hdr(skb)->daddr, &gw);
924 #endif
925         }
926 out_put_peer:
927         inet_putpeer(peer);
928 }
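
/*
 * Worked example of the backoff above with HZ == 1000 and the default
 * tunables: after the n-th redirect is sent (rate_tokens == n), the next
 * one is held back for ip_rt_redirect_load << n jiffies, i.e. 40ms, 80ms,
 * ... up to ~5.1s before the 9th.  Once ip_rt_redirect_number (9)
 * redirects have been ignored we stop entirely, and only ~20.5s of
 * silence (ip_rt_redirect_silence) resets the counters.
 */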
929
930 static int ip_error(struct sk_buff *skb)
931 {
932         struct rtable *rt = skb_rtable(skb);
933         struct net_device *dev = skb->dev;
934         struct in_device *in_dev;
935         struct inet_peer *peer;
936         unsigned long now;
937         struct net *net;
938         bool send;
939         int code;
940
941         if (netif_is_l3_master(skb->dev)) {
942                 dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
943                 if (!dev)
944                         goto out;
945         }
946
947         in_dev = __in_dev_get_rcu(dev);
948
949         /* IP on this device is disabled. */
950         if (!in_dev)
951                 goto out;
952
953         net = dev_net(rt->dst.dev);
954         if (!IN_DEV_FORWARD(in_dev)) {
955                 switch (rt->dst.error) {
956                 case EHOSTUNREACH:
957                         __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
958                         break;
959
960                 case ENETUNREACH:
961                         __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
962                         break;
963                 }
964                 goto out;
965         }
966
967         switch (rt->dst.error) {
968         case EINVAL:
969         default:
970                 goto out;
971         case EHOSTUNREACH:
972                 code = ICMP_HOST_UNREACH;
973                 break;
974         case ENETUNREACH:
975                 code = ICMP_NET_UNREACH;
976                 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
977                 break;
978         case EACCES:
979                 code = ICMP_PKT_FILTERED;
980                 break;
981         }
982
983         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
984                                l3mdev_master_ifindex(skb->dev), 1);
985
986         send = true;
987         if (peer) {
988                 now = jiffies;
989                 peer->rate_tokens += now - peer->rate_last;
990                 if (peer->rate_tokens > ip_rt_error_burst)
991                         peer->rate_tokens = ip_rt_error_burst;
992                 peer->rate_last = now;
993                 if (peer->rate_tokens >= ip_rt_error_cost)
994                         peer->rate_tokens -= ip_rt_error_cost;
995                 else
996                         send = false;
997                 inet_putpeer(peer);
998         }
999         if (send)
1000                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1001
1002 out:    kfree_skb(skb);
1003         return 0;
1004 }
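
/*
 * The peer limiter above is a token bucket: tokens accrue one per jiffy
 * since rate_last, capped at ip_rt_error_burst (5 * HZ), and each ICMP
 * error sent costs ip_rt_error_cost (HZ).  With HZ == 1000 that permits
 * a burst of 5 errors, then roughly one per second.
 */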
1005
1006 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1007 {
1008         struct dst_entry *dst = &rt->dst;
1009         u32 old_mtu = ipv4_mtu(dst);
1010         struct fib_result res;
1011         bool lock = false;
1012
1013         if (ip_mtu_locked(dst))
1014                 return;
1015
1016         if (old_mtu < mtu)
1017                 return;
1018
1019         if (mtu < ip_rt_min_pmtu) {
1020                 lock = true;
1021                 mtu = min(old_mtu, ip_rt_min_pmtu);
1022         }
1023
1024         if (rt->rt_pmtu == mtu && !lock &&
1025             time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1026                 return;
1027
1028         rcu_read_lock();
1029         if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
1030                 struct fib_nh *nh = &FIB_RES_NH(res);
1031
1032                 update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
1033                                       jiffies + ip_rt_mtu_expires);
1034         }
1035         rcu_read_unlock();
1036 }
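
/*
 * Example: an ICMP_FRAG_NEEDED quoting mtu == 300 falls below
 * ip_rt_min_pmtu (552 by default), so the exception is created with
 * mtu == min(old_mtu, 552) and fnhe_mtu_locked set; once locked,
 * ip_mtu_locked() makes later updates return early, so the path MTU
 * cannot be pushed below the floor.
 */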
1037
1038 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1039                               struct sk_buff *skb, u32 mtu)
1040 {
1041         struct rtable *rt = (struct rtable *) dst;
1042         struct flowi4 fl4;
1043
1044         ip_rt_build_flow_key(&fl4, sk, skb);
1045         __ip_rt_update_pmtu(rt, &fl4, mtu);
1046 }
1047
1048 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1049                       int oif, u8 protocol)
1050 {
1051         const struct iphdr *iph = (const struct iphdr *) skb->data;
1052         struct flowi4 fl4;
1053         struct rtable *rt;
1054         u32 mark = IP4_REPLY_MARK(net, skb->mark);
1055
1056         __build_flow_key(net, &fl4, NULL, iph, oif,
1057                          RT_TOS(iph->tos), protocol, mark, 0);
1058         rt = __ip_route_output_key(net, &fl4);
1059         if (!IS_ERR(rt)) {
1060                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1061                 ip_rt_put(rt);
1062         }
1063 }
1064 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1065
1066 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1067 {
1068         const struct iphdr *iph = (const struct iphdr *) skb->data;
1069         struct flowi4 fl4;
1070         struct rtable *rt;
1071
1072         __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1073
1074         if (!fl4.flowi4_mark)
1075                 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1076
1077         rt = __ip_route_output_key(sock_net(sk), &fl4);
1078         if (!IS_ERR(rt)) {
1079                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1080                 ip_rt_put(rt);
1081         }
1082 }
1083
1084 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1085 {
1086         const struct iphdr *iph = (const struct iphdr *) skb->data;
1087         struct flowi4 fl4;
1088         struct rtable *rt;
1089         struct dst_entry *odst = NULL;
1090         bool new = false;
1091         struct net *net = sock_net(sk);
1092
1093         bh_lock_sock(sk);
1094
1095         if (!ip_sk_accept_pmtu(sk))
1096                 goto out;
1097
1098         odst = sk_dst_get(sk);
1099
1100         if (sock_owned_by_user(sk) || !odst) {
1101                 __ipv4_sk_update_pmtu(skb, sk, mtu);
1102                 goto out;
1103         }
1104
1105         __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1106
1107         rt = (struct rtable *)odst;
1108         if (odst->obsolete && !odst->ops->check(odst, 0)) {
1109                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1110                 if (IS_ERR(rt))
1111                         goto out;
1112
1113                 new = true;
1114         }
1115
1116         __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);
1117
1118         if (!dst_check(&rt->dst, 0)) {
1119                 if (new)
1120                         dst_release(&rt->dst);
1121
1122                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1123                 if (IS_ERR(rt))
1124                         goto out;
1125
1126                 new = true;
1127         }
1128
1129         if (new)
1130                 sk_dst_set(sk, &rt->dst);
1131
1132 out:
1133         bh_unlock_sock(sk);
1134         dst_release(odst);
1135 }
1136 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1137
1138 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1139                    int oif, u8 protocol)
1140 {
1141         const struct iphdr *iph = (const struct iphdr *) skb->data;
1142         struct flowi4 fl4;
1143         struct rtable *rt;
1144
1145         __build_flow_key(net, &fl4, NULL, iph, oif,
1146                          RT_TOS(iph->tos), protocol, 0, 0);
1147         rt = __ip_route_output_key(net, &fl4);
1148         if (!IS_ERR(rt)) {
1149                 __ip_do_redirect(rt, skb, &fl4, false);
1150                 ip_rt_put(rt);
1151         }
1152 }
1153 EXPORT_SYMBOL_GPL(ipv4_redirect);
1154
1155 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1156 {
1157         const struct iphdr *iph = (const struct iphdr *) skb->data;
1158         struct flowi4 fl4;
1159         struct rtable *rt;
1160         struct net *net = sock_net(sk);
1161
1162         __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1163         rt = __ip_route_output_key(net, &fl4);
1164         if (!IS_ERR(rt)) {
1165                 __ip_do_redirect(rt, skb, &fl4, false);
1166                 ip_rt_put(rt);
1167         }
1168 }
1169 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1170
1171 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1172 {
1173         struct rtable *rt = (struct rtable *) dst;
1174
1175         /* All IPV4 dsts are created with ->obsolete set to the value
1176          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1177          * into this function always.
1178          *
1179          * When a PMTU/redirect information update invalidates a route,
1180          * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1181          * DST_OBSOLETE_DEAD.
1182          */
1183         if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1184                 return NULL;
1185         return dst;
1186 }
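
/*
 * Callers revalidate a cached route exactly this way; for instance
 * ipv4_sk_update_pmtu() above does (sketch):
 *
 *	if (odst->obsolete && !odst->ops->check(odst, 0))
 *		rt = ip_route_output_flow(...);	// cached dst is stale
 */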
1187
1188 static void ipv4_link_failure(struct sk_buff *skb)
1189 {
1190         struct rtable *rt;
1191
1192         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1193
1194         rt = skb_rtable(skb);
1195         if (rt)
1196                 dst_set_expires(&rt->dst, 0);
1197 }
1198
1199 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1200 {
1201         pr_debug("%s: %pI4 -> %pI4, %s\n",
1202                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1203                  skb->dev ? skb->dev->name : "?");
1204         kfree_skb(skb);
1205         WARN_ON(1);
1206         return 0;
1207 }
1208
1209 /*
1210    We do not cache the source address of the outgoing interface,
1211    because it is used only by the IP RR, TS and SRR options,
1212    so it is out of the fast path.
1213
1214    BTW remember: "addr" may be unaligned
1215    in IP options!
1216  */
1217
1218 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1219 {
1220         __be32 src;
1221
1222         if (rt_is_output_route(rt))
1223                 src = ip_hdr(skb)->saddr;
1224         else {
1225                 struct fib_result res;
1226                 struct iphdr *iph = ip_hdr(skb);
1227                 struct flowi4 fl4 = {
1228                         .daddr = iph->daddr,
1229                         .saddr = iph->saddr,
1230                         .flowi4_tos = RT_TOS(iph->tos),
1231                         .flowi4_oif = rt->dst.dev->ifindex,
1232                         .flowi4_iif = skb->dev->ifindex,
1233                         .flowi4_mark = skb->mark,
1234                 };
1235
1236                 rcu_read_lock();
1237                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1238                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1239                 else
1240                         src = inet_select_addr(rt->dst.dev,
1241                                                rt_nexthop(rt, iph->daddr),
1242                                                RT_SCOPE_UNIVERSE);
1243                 rcu_read_unlock();
1244         }
1245         memcpy(addr, &src, 4);
1246 }
1247
1248 #ifdef CONFIG_IP_ROUTE_CLASSID
1249 static void set_class_tag(struct rtable *rt, u32 tag)
1250 {
1251         if (!(rt->dst.tclassid & 0xFFFF))
1252                 rt->dst.tclassid |= tag & 0xFFFF;
1253         if (!(rt->dst.tclassid & 0xFFFF0000))
1254                 rt->dst.tclassid |= tag & 0xFFFF0000;
1255 }
1256 #endif
1257
1258 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1259 {
1260         unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1261         unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1262                                     ip_rt_min_advmss);
1263
1264         return min(advmss, IPV4_MAX_PMTU - header_size);
1265 }
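
/*
 * E.g. a 1500 byte device MTU advertises an MSS of 1460 (1500 minus 40
 * bytes of IPv4 + TCP headers), clamped between ip_rt_min_advmss (256)
 * and IPV4_MAX_PMTU - 40.
 */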
1266
1267 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1268 {
1269         const struct rtable *rt = (const struct rtable *) dst;
1270         unsigned int mtu = rt->rt_pmtu;
1271
1272         if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1273                 mtu = dst_metric_raw(dst, RTAX_MTU);
1274
1275         if (mtu)
1276                 return mtu;
1277
1278         mtu = READ_ONCE(dst->dev->mtu);
1279
1280         if (unlikely(ip_mtu_locked(dst))) {
1281                 if (rt->rt_uses_gateway && mtu > 576)
1282                         mtu = 576;
1283         }
1284
1285         mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1286
1287         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1288 }
1289
1290 static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1291 {
1292         struct fnhe_hash_bucket *hash;
1293         struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1294         u32 hval = fnhe_hashfun(daddr);
1295
1296         spin_lock_bh(&fnhe_lock);
1297
1298         hash = rcu_dereference_protected(nh->nh_exceptions,
1299                                          lockdep_is_held(&fnhe_lock));
1300         hash += hval;
1301
1302         fnhe_p = &hash->chain;
1303         fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1304         while (fnhe) {
1305                 if (fnhe->fnhe_daddr == daddr) {
1306                         rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1307                                 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1308                         /* set fnhe_daddr to 0 to ensure it won't bind with
1309                          * new dsts in rt_bind_exception().
1310                          */
1311                         fnhe->fnhe_daddr = 0;
1312                         fnhe_flush_routes(fnhe);
1313                         kfree_rcu(fnhe, rcu);
1314                         break;
1315                 }
1316                 fnhe_p = &fnhe->fnhe_next;
1317                 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1318                                                  lockdep_is_held(&fnhe_lock));
1319         }
1320
1321         spin_unlock_bh(&fnhe_lock);
1322 }
1323
1324 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1325 {
1326         struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1327         struct fib_nh_exception *fnhe;
1328         u32 hval;
1329
1330         if (!hash)
1331                 return NULL;
1332
1333         hval = fnhe_hashfun(daddr);
1334
1335         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1336              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1337                 if (fnhe->fnhe_daddr == daddr) {
1338                         if (fnhe->fnhe_expires &&
1339                             time_after(jiffies, fnhe->fnhe_expires)) {
1340                                 ip_del_fnhe(nh, daddr);
1341                                 break;
1342                         }
1343                         return fnhe;
1344                 }
1345         }
1346         return NULL;
1347 }
1348
1349 /* MTU selection:
1350  * 1. mtu on route is locked - use it
1351  * 2. mtu from nexthop exception
1352  * 3. mtu from egress device
1353  */
1354
1355 u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1356 {
1357         struct fib_info *fi = res->fi;
1358         struct fib_nh *nh = &fi->fib_nh[res->nh_sel];
1359         struct net_device *dev = nh->nh_dev;
1360         u32 mtu = 0;
1361
1362         if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
1363             fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1364                 mtu = fi->fib_mtu;
1365
1366         if (likely(!mtu)) {
1367                 struct fib_nh_exception *fnhe;
1368
1369                 fnhe = find_exception(nh, daddr);
1370                 if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1371                         mtu = fnhe->fnhe_pmtu;
1372         }
1373
1374         if (likely(!mtu))
1375                 mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1376
1377         return mtu - lwtunnel_headroom(nh->nh_lwtstate, mtu);
1378 }
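
/*
 * Worked example of the order above: with a 1500 byte nexthop device MTU,
 * an unexpired exception with fnhe_pmtu == 1400 makes this return 1400
 * (minus any lwtunnel encap headroom); a locked RTAX_MTU route metric
 * would instead win outright through fib_mtu.
 */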
1379
1380 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1381                               __be32 daddr, const bool do_cache)
1382 {
1383         bool ret = false;
1384
1385         spin_lock_bh(&fnhe_lock);
1386
1387         if (daddr == fnhe->fnhe_daddr) {
1388                 struct rtable __rcu **porig;
1389                 struct rtable *orig;
1390                 int genid = fnhe_genid(dev_net(rt->dst.dev));
1391
1392                 if (rt_is_input_route(rt))
1393                         porig = &fnhe->fnhe_rth_input;
1394                 else
1395                         porig = &fnhe->fnhe_rth_output;
1396                 orig = rcu_dereference(*porig);
1397
1398                 if (fnhe->fnhe_genid != genid) {
1399                         fnhe->fnhe_genid = genid;
1400                         fnhe->fnhe_gw = 0;
1401                         fnhe->fnhe_pmtu = 0;
1402                         fnhe->fnhe_expires = 0;
1403                         fnhe->fnhe_mtu_locked = false;
1404                         fnhe_flush_routes(fnhe);
1405                         orig = NULL;
1406                 }
1407                 fill_route_from_fnhe(rt, fnhe);
1408                 if (!rt->rt_gateway)
1409                         rt->rt_gateway = daddr;
1410
1411                 if (do_cache) {
1412                         dst_hold(&rt->dst);
1413                         rcu_assign_pointer(*porig, rt);
1414                         if (orig) {
1415                                 dst_dev_put(&orig->dst);
1416                                 dst_release(&orig->dst);
1417                         }
1418                         ret = true;
1419                 }
1420
1421                 fnhe->fnhe_stamp = jiffies;
1422         }
1423         spin_unlock_bh(&fnhe_lock);
1424
1425         return ret;
1426 }
1427
1428 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1429 {
1430         struct rtable *orig, *prev, **p;
1431         bool ret = true;
1432
1433         if (rt_is_input_route(rt)) {
1434                 p = (struct rtable **)&nh->nh_rth_input;
1435         } else {
1436                 p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1437         }
1438         orig = *p;
1439
1440         /* hold dst before doing cmpxchg() to avoid race condition
1441          * on this dst
1442          */
1443         dst_hold(&rt->dst);
1444         prev = cmpxchg(p, orig, rt);
1445         if (prev == orig) {
1446                 if (orig) {
1447                         dst_dev_put(&orig->dst);
1448                         dst_release(&orig->dst);
1449                 }
1450         } else {
1451                 dst_release(&rt->dst);
1452                 ret = false;
1453         }
1454
1455         return ret;
1456 }
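
/*
 * On the hold-before-cmpxchg() ordering above: if we published rt first
 * and took the reference afterwards, a concurrent rt_cache_route() could
 * replace and release our entry before our reference existed, leaving a
 * window in which rt might be freed.  Taking the reference first means
 * losing the race only costs us a dst_release() of our own extra ref.
 */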
1457
1458 struct uncached_list {
1459         spinlock_t              lock;
1460         struct list_head        head;
1461 };
1462
1463 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1464
1465 void rt_add_uncached_list(struct rtable *rt)
1466 {
1467         struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1468
1469         rt->rt_uncached_list = ul;
1470
1471         spin_lock_bh(&ul->lock);
1472         list_add_tail(&rt->rt_uncached, &ul->head);
1473         spin_unlock_bh(&ul->lock);
1474 }
1475
1476 void rt_del_uncached_list(struct rtable *rt)
1477 {
1478         if (!list_empty(&rt->rt_uncached)) {
1479                 struct uncached_list *ul = rt->rt_uncached_list;
1480
1481                 spin_lock_bh(&ul->lock);
1482                 list_del(&rt->rt_uncached);
1483                 spin_unlock_bh(&ul->lock);
1484         }
1485 }
1486
1487 static void ipv4_dst_destroy(struct dst_entry *dst)
1488 {
1489         struct rtable *rt = (struct rtable *)dst;
1490
1491         ip_dst_metrics_put(dst);
1492         rt_del_uncached_list(rt);
1493 }
1494
1495 void rt_flush_dev(struct net_device *dev)
1496 {
1497         struct net *net = dev_net(dev);
1498         struct rtable *rt;
1499         int cpu;
1500
1501         for_each_possible_cpu(cpu) {
1502                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1503
1504                 spin_lock_bh(&ul->lock);
1505                 list_for_each_entry(rt, &ul->head, rt_uncached) {
1506                         if (rt->dst.dev != dev)
1507                                 continue;
1508                         rt->dst.dev = net->loopback_dev;
1509                         dev_hold(rt->dst.dev);
1510                         dev_put(dev);
1511                 }
1512                 spin_unlock_bh(&ul->lock);
1513         }
1514 }
1515
1516 static bool rt_cache_valid(const struct rtable *rt)
1517 {
1518         return  rt &&
1519                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1520                 !rt_is_expired(rt);
1521 }
1522
1523 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1524                            const struct fib_result *res,
1525                            struct fib_nh_exception *fnhe,
1526                            struct fib_info *fi, u16 type, u32 itag,
1527                            const bool do_cache)
1528 {
1529         bool cached = false;
1530
1531         if (fi) {
1532                 struct fib_nh *nh = &FIB_RES_NH(*res);
1533
1534                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1535                         rt->rt_gateway = nh->nh_gw;
1536                         rt->rt_uses_gateway = 1;
1537                 }
1538                 ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1539
1540 #ifdef CONFIG_IP_ROUTE_CLASSID
1541                 rt->dst.tclassid = nh->nh_tclassid;
1542 #endif
1543                 rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1544                 if (unlikely(fnhe))
1545                         cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1546                 else if (do_cache)
1547                         cached = rt_cache_route(nh, rt);
1548                 if (unlikely(!cached)) {
1549                         /* Routes we intend to cache in the nexthop exception
1550                          * or FIB nexthop have the DST_NOCACHE bit clear.
1551                          * However, if we are unsuccessful at storing this
1552                          * route into the cache, we really need to set it.
1553                          */
1554                         if (!rt->rt_gateway)
1555                                 rt->rt_gateway = daddr;
1556                         rt_add_uncached_list(rt);
1557                 }
1558         } else
1559                 rt_add_uncached_list(rt);
1560
1561 #ifdef CONFIG_IP_ROUTE_CLASSID
1562 #ifdef CONFIG_IP_MULTIPLE_TABLES
1563         set_class_tag(rt, res->tclassid);
1564 #endif
1565         set_class_tag(rt, itag);
1566 #endif
1567 }
1568
1569 struct rtable *rt_dst_alloc(struct net_device *dev,
1570                             unsigned int flags, u16 type,
1571                             bool nopolicy, bool noxfrm, bool will_cache)
1572 {
1573         struct rtable *rt;
1574
1575         rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1576                        (will_cache ? 0 : DST_HOST) |
1577                        (nopolicy ? DST_NOPOLICY : 0) |
1578                        (noxfrm ? DST_NOXFRM : 0));
1579
1580         if (rt) {
1581                 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1582                 rt->rt_flags = flags;
1583                 rt->rt_type = type;
1584                 rt->rt_is_input = 0;
1585                 rt->rt_iif = 0;
1586                 rt->rt_pmtu = 0;
1587                 rt->rt_mtu_locked = 0;
1588                 rt->rt_gateway = 0;
1589                 rt->rt_uses_gateway = 0;
1590                 INIT_LIST_HEAD(&rt->rt_uncached);
1591
1592                 rt->dst.output = ip_output;
1593                 if (flags & RTCF_LOCAL)
1594                         rt->dst.input = ip_local_deliver;
1595         }
1596
1597         return rt;
1598 }
1599 EXPORT_SYMBOL(rt_dst_alloc);
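
/* A minimal sketch of a caller, modelled on ip_route_input_mc() below:
 * allocate a route, point the handlers somewhere sane and attach it to the
 * skb. The flags, type and config bits here are illustrative assumptions.
 */
static int example_attach_local_route(struct sk_buff *skb,
                                      struct net_device *dev)
{
        struct rtable *rth;

        rth = rt_dst_alloc(dev, RTCF_LOCAL, RTN_LOCAL, false, false, false);
        if (!rth)
                return -ENOBUFS;

        rth->rt_is_input = 1;           /* input path only */
        rth->dst.output = ip_rt_bug;    /* must never be asked to transmit */
        skb_dst_set(skb, &rth->dst);
        return 0;
}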
1600
1601 /* called in rcu_read_lock() section */
1602 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1603                           u8 tos, struct net_device *dev,
1604                           struct in_device *in_dev, u32 *itag)
1605 {
1606         int err;
1607
1608         /* Primary sanity checks. */
1609         if (!in_dev)
1610                 return -EINVAL;
1611
1612         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1613             skb->protocol != htons(ETH_P_IP))
1614                 return -EINVAL;
1615
1616         if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1617                 return -EINVAL;
1618
1619         if (ipv4_is_zeronet(saddr)) {
1620                 if (!ipv4_is_local_multicast(daddr) &&
1621                     ip_hdr(skb)->protocol != IPPROTO_IGMP)
1622                         return -EINVAL;
1623         } else {
1624                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1625                                           in_dev, itag);
1626                 if (err < 0)
1627                         return err;
1628         }
1629         return 0;
1630 }
1631
1632 /* called in rcu_read_lock() section */
1633 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1634                              u8 tos, struct net_device *dev, int our)
1635 {
1636         struct in_device *in_dev = __in_dev_get_rcu(dev);
1637         unsigned int flags = RTCF_MULTICAST;
1638         struct rtable *rth;
1639         u32 itag = 0;
1640         int err;
1641
1642         err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1643         if (err)
1644                 return err;
1645
1646         if (our)
1647                 flags |= RTCF_LOCAL;
1648
1649         rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1650                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1651         if (!rth)
1652                 return -ENOBUFS;
1653
1654 #ifdef CONFIG_IP_ROUTE_CLASSID
1655         rth->dst.tclassid = itag;
1656 #endif
1657         rth->dst.output = ip_rt_bug;
1658         rth->rt_is_input = 1;
1659
1660 #ifdef CONFIG_IP_MROUTE
1661         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1662                 rth->dst.input = ip_mr_input;
1663 #endif
1664         RT_CACHE_STAT_INC(in_slow_mc);
1665
1666         skb_dst_set(skb, &rth->dst);
1667         return 0;
1668 }
1669
1670
1671 static void ip_handle_martian_source(struct net_device *dev,
1672                                      struct in_device *in_dev,
1673                                      struct sk_buff *skb,
1674                                      __be32 daddr,
1675                                      __be32 saddr)
1676 {
1677         RT_CACHE_STAT_INC(in_martian_src);
1678 #ifdef CONFIG_IP_ROUTE_VERBOSE
1679         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1680                 /*
1681                  *      RFC1812 recommendation: if the source is martian,
1682                  *      the only hint is the MAC header.
1683                  */
1684                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1685                         &daddr, &saddr, dev->name);
1686                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1687                         print_hex_dump(KERN_WARNING, "ll header: ",
1688                                        DUMP_PREFIX_OFFSET, 16, 1,
1689                                        skb_mac_header(skb),
1690                                        dev->hard_header_len, false);
1691                 }
1692         }
1693 #endif
1694 }
1695
1696 /* called in rcu_read_lock() section */
1697 static int __mkroute_input(struct sk_buff *skb,
1698                            const struct fib_result *res,
1699                            struct in_device *in_dev,
1700                            __be32 daddr, __be32 saddr, u32 tos)
1701 {
1702         struct fib_nh_exception *fnhe;
1703         struct rtable *rth;
1704         int err;
1705         struct in_device *out_dev;
1706         bool do_cache;
1707         u32 itag = 0;
1708
1709         /* get a working reference to the output device */
1710         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1711         if (!out_dev) {
1712                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1713                 return -EINVAL;
1714         }
1715
1716         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1717                                   in_dev->dev, in_dev, &itag);
1718         if (err < 0) {
1719                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1720                                          saddr);
1721
1722                 goto cleanup;
1723         }
1724
1725         do_cache = res->fi && !itag;
1726         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1727             skb->protocol == htons(ETH_P_IP) &&
1728             (IN_DEV_SHARED_MEDIA(out_dev) ||
1729              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1730                 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1731
1732         if (skb->protocol != htons(ETH_P_IP)) {
1733                 /* Not IP (i.e. ARP). Do not create a route if it is
1734                  * invalid for proxy arp. DNAT routes are always valid.
1735                  *
1736                  * The proxy arp feature has been extended to allow ARP
1737                  * replies back out the same interface, to support
1738                  * Private VLAN switch technologies. See arp.c.
1739                  */
1740                 if (out_dev == in_dev &&
1741                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1742                         err = -EINVAL;
1743                         goto cleanup;
1744                 }
1745         }
1746
1747         fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1748         if (do_cache) {
1749                 if (fnhe)
1750                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1751                 else
1752                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1753                 if (rt_cache_valid(rth)) {
1754                         skb_dst_set_noref(skb, &rth->dst);
1755                         goto out;
1756                 }
1757         }
1758
1759         rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1760                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1761                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1762         if (!rth) {
1763                 err = -ENOBUFS;
1764                 goto cleanup;
1765         }
1766
1767         rth->rt_is_input = 1;
1768         RT_CACHE_STAT_INC(in_slow_tot);
1769
1770         rth->dst.input = ip_forward;
1771
1772         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1773                        do_cache);
1774         lwtunnel_set_redirect(&rth->dst);
1775         skb_dst_set(skb, &rth->dst);
1776 out:
1777         err = 0;
1778  cleanup:
1779         return err;
1780 }
1781
1782 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1783 /* To make ICMP packets follow the right flow, the multipath hash is
1784  * calculated from the inner IP addresses.
1785  */
1786 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1787                                  struct flow_keys *hash_keys)
1788 {
1789         const struct iphdr *outer_iph = ip_hdr(skb);
1790         const struct iphdr *key_iph = outer_iph;
1791         const struct iphdr *inner_iph;
1792         const struct icmphdr *icmph;
1793         struct iphdr _inner_iph;
1794         struct icmphdr _icmph;
1795
1796         if (likely(outer_iph->protocol != IPPROTO_ICMP))
1797                 goto out;
1798
1799         if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1800                 goto out;
1801
1802         icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1803                                    &_icmph);
1804         if (!icmph)
1805                 goto out;
1806
1807         if (icmph->type != ICMP_DEST_UNREACH &&
1808             icmph->type != ICMP_REDIRECT &&
1809             icmph->type != ICMP_TIME_EXCEEDED &&
1810             icmph->type != ICMP_PARAMETERPROB)
1811                 goto out;
1812
1813         inner_iph = skb_header_pointer(skb,
1814                                        outer_iph->ihl * 4 + sizeof(_icmph),
1815                                        sizeof(_inner_iph), &_inner_iph);
1816         if (!inner_iph)
1817                 goto out;
1818
1819         key_iph = inner_iph;
1820 out:
1821         hash_keys->addrs.v4addrs.src = key_iph->saddr;
1822         hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1823 }
1824
1825 /* if skb is set it will be used and fl4 can be NULL */
1826 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1827                        const struct sk_buff *skb, struct flow_keys *flkeys)
1828 {
1829         u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1830         struct flow_keys hash_keys;
1831         u32 mhash;
1832
1833         switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1834         case 0:
1835                 memset(&hash_keys, 0, sizeof(hash_keys));
1836                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1837                 if (skb) {
1838                         ip_multipath_l3_keys(skb, &hash_keys);
1839                 } else {
1840                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1841                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1842                 }
1843                 break;
1844         case 1:
1845                 /* skb is currently provided only when forwarding */
1846                 if (skb) {
1847                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1848                         struct flow_keys keys;
1849
1850                         /* short-circuit if we already have L4 hash present */
1851                         if (skb->l4_hash)
1852                                 return skb_get_hash_raw(skb) >> 1;
1853
1854                         memset(&hash_keys, 0, sizeof(hash_keys));
1855
1856                         if (!flkeys) {
1857                                 skb_flow_dissect_flow_keys(skb, &keys, flag);
1858                                 flkeys = &keys;
1859                         }
1860
1861                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1862                         hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1863                         hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1864                         hash_keys.ports.src = flkeys->ports.src;
1865                         hash_keys.ports.dst = flkeys->ports.dst;
1866                         hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1867                 } else {
1868                         memset(&hash_keys, 0, sizeof(hash_keys));
1869                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1870                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1871                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1872                         hash_keys.ports.src = fl4->fl4_sport;
1873                         hash_keys.ports.dst = fl4->fl4_dport;
1874                         hash_keys.basic.ip_proto = fl4->flowi4_proto;
1875                 }
1876                 break;
1877         }
1878         mhash = flow_hash_from_keys(&hash_keys);
1879
1880         if (multipath_hash)
1881                 mhash = jhash_2words(mhash, multipath_hash, 0);
1882
1883         return mhash >> 1;
1884 }
1885 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
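
/* The value returned above feeds fib_select_multipath(), which walks the
 * nexthops and takes the first one whose precomputed upper bound covers the
 * hash; the final ">> 1" keeps the hash within the 31-bit range those
 * bounds are scaled to. A minimal sketch of that threshold selection over
 * a hypothetical array of bounds:
 */
static int example_select_nexthop(u32 mhash, const u32 *upper_bound, int nhs)
{
        int i;

        for (i = 0; i < nhs; i++)
                if (mhash <= upper_bound[i])
                        return i;       /* first bound covering the hash */
        return nhs - 1;                 /* last bound should cover INT_MAX */
}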
1886
1887 static int ip_mkroute_input(struct sk_buff *skb,
1888                             struct fib_result *res,
1889                             struct in_device *in_dev,
1890                             __be32 daddr, __be32 saddr, u32 tos,
1891                             struct flow_keys *hkeys)
1892 {
1893 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1894         if (res->fi && res->fi->fib_nhs > 1) {
1895                 int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
1896
1897                 fib_select_multipath(res, h);
1898         }
1899 #endif
1900
1901         /* create a routing cache entry */
1902         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1903 }
1904
1905 /*
1906  *      NOTE. We drop all packets that have a local source
1907  *      address, because every properly looped-back packet
1908  *      must already have the correct destination attached by the output routine.
1909  *
1910  *      This approach solves two big problems:
1911  *      1. Non-simplex devices are handled properly.
1912  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1913  *      Called with rcu_read_lock().
1914  */
1915
1916 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1917                                u8 tos, struct net_device *dev,
1918                                struct fib_result *res)
1919 {
1920         struct in_device *in_dev = __in_dev_get_rcu(dev);
1921         struct flow_keys *flkeys = NULL, _flkeys;
1922         struct net    *net = dev_net(dev);
1923         struct ip_tunnel_info *tun_info;
1924         int             err = -EINVAL;
1925         unsigned int    flags = 0;
1926         u32             itag = 0;
1927         struct rtable   *rth;
1928         struct flowi4   fl4;
1929         bool do_cache;
1930
1931         /* IP on this device is disabled. */
1932
1933         if (!in_dev)
1934                 goto out;
1935
1936         /* Check for the weirdest martians, which cannot be detected
1937            by fib_lookup.
1938          */
1939
1940         tun_info = skb_tunnel_info(skb);
1941         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1942                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1943         else
1944                 fl4.flowi4_tun_key.tun_id = 0;
1945         skb_dst_drop(skb);
1946
1947         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1948                 goto martian_source;
1949
1950         res->fi = NULL;
1951         res->table = NULL;
1952         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1953                 goto brd_input;
1954
1955         /* Accept zero addresses only for limited broadcast;
1956          * I do not even know whether to fix this or not. Waiting for complaints :-)
1957          */
1958         if (ipv4_is_zeronet(saddr))
1959                 goto martian_source;
1960
1961         if (ipv4_is_zeronet(daddr))
1962                 goto martian_destination;
1963
1964         /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1965          * and calls it at most once, when daddr and/or saddr is a loopback address.
1966          */
1967         if (ipv4_is_loopback(daddr)) {
1968                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1969                         goto martian_destination;
1970         } else if (ipv4_is_loopback(saddr)) {
1971                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1972                         goto martian_source;
1973         }
1974
1975         /*
1976          *      Now we are ready to route packet.
1977          */
1978         fl4.flowi4_oif = 0;
1979         fl4.flowi4_iif = dev->ifindex;
1980         fl4.flowi4_mark = skb->mark;
1981         fl4.flowi4_tos = tos;
1982         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1983         fl4.flowi4_flags = 0;
1984         fl4.daddr = daddr;
1985         fl4.saddr = saddr;
1986         fl4.flowi4_uid = sock_net_uid(net, NULL);
1987
1988         if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
1989                 flkeys = &_flkeys;
1990         } else {
1991                 fl4.flowi4_proto = 0;
1992                 fl4.fl4_sport = 0;
1993                 fl4.fl4_dport = 0;
1994         }
1995
1996         err = fib_lookup(net, &fl4, res, 0);
1997         if (err != 0) {
1998                 if (!IN_DEV_FORWARD(in_dev))
1999                         err = -EHOSTUNREACH;
2000                 goto no_route;
2001         }
2002
2003         if (res->type == RTN_BROADCAST) {
2004                 if (IN_DEV_BFORWARD(in_dev))
2005                         goto make_route;
2006                 goto brd_input;
2007         }
2008
2009         if (res->type == RTN_LOCAL) {
2010                 err = fib_validate_source(skb, saddr, daddr, tos,
2011                                           0, dev, in_dev, &itag);
2012                 if (err < 0)
2013                         goto martian_source;
2014                 goto local_input;
2015         }
2016
2017         if (!IN_DEV_FORWARD(in_dev)) {
2018                 err = -EHOSTUNREACH;
2019                 goto no_route;
2020         }
2021         if (res->type != RTN_UNICAST)
2022                 goto martian_destination;
2023
2024 make_route:
2025         err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2026 out:    return err;
2027
2028 brd_input:
2029         if (skb->protocol != htons(ETH_P_IP))
2030                 goto e_inval;
2031
2032         if (!ipv4_is_zeronet(saddr)) {
2033                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2034                                           in_dev, &itag);
2035                 if (err < 0)
2036                         goto martian_source;
2037         }
2038         flags |= RTCF_BROADCAST;
2039         res->type = RTN_BROADCAST;
2040         RT_CACHE_STAT_INC(in_brd);
2041
2042 local_input:
2043         do_cache = false;
2044         if (res->fi) {
2045                 if (!itag) {
2046                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
2047                         if (rt_cache_valid(rth)) {
2048                                 skb_dst_set_noref(skb, &rth->dst);
2049                                 err = 0;
2050                                 goto out;
2051                         }
2052                         do_cache = true;
2053                 }
2054         }
2055
2056         rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2057                            flags | RTCF_LOCAL, res->type,
2058                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2059         if (!rth)
2060                 goto e_nobufs;
2061
2062         rth->dst.output = ip_rt_bug;
2063 #ifdef CONFIG_IP_ROUTE_CLASSID
2064         rth->dst.tclassid = itag;
2065 #endif
2066         rth->rt_is_input = 1;
2067
2068         RT_CACHE_STAT_INC(in_slow_tot);
2069         if (res->type == RTN_UNREACHABLE) {
2070                 rth->dst.input = ip_error;
2071                 rth->dst.error = -err;
2072                 rth->rt_flags &= ~RTCF_LOCAL;
2073         }
2074
2075         if (do_cache) {
2076                 struct fib_nh *nh = &FIB_RES_NH(*res);
2077
2078                 rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2079                 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2080                         WARN_ON(rth->dst.input == lwtunnel_input);
2081                         rth->dst.lwtstate->orig_input = rth->dst.input;
2082                         rth->dst.input = lwtunnel_input;
2083                 }
2084
2085                 if (unlikely(!rt_cache_route(nh, rth)))
2086                         rt_add_uncached_list(rth);
2087         }
2088         skb_dst_set(skb, &rth->dst);
2089         err = 0;
2090         goto out;
2091
2092 no_route:
2093         RT_CACHE_STAT_INC(in_no_route);
2094         res->type = RTN_UNREACHABLE;
2095         res->fi = NULL;
2096         res->table = NULL;
2097         goto local_input;
2098
2099         /*
2100          *      Do not cache martian addresses: they should be logged (RFC1812)
2101          */
2102 martian_destination:
2103         RT_CACHE_STAT_INC(in_martian_dst);
2104 #ifdef CONFIG_IP_ROUTE_VERBOSE
2105         if (IN_DEV_LOG_MARTIANS(in_dev))
2106                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2107                                      &daddr, &saddr, dev->name);
2108 #endif
2109
2110 e_inval:
2111         err = -EINVAL;
2112         goto out;
2113
2114 e_nobufs:
2115         err = -ENOBUFS;
2116         goto out;
2117
2118 martian_source:
2119         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2120         goto out;
2121 }
2122
2123 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2124                          u8 tos, struct net_device *dev)
2125 {
2126         struct fib_result res;
2127         int err;
2128
2129         tos &= IPTOS_RT_MASK;
2130         rcu_read_lock();
2131         err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2132         rcu_read_unlock();
2133
2134         return err;
2135 }
2136 EXPORT_SYMBOL(ip_route_input_noref);
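
/* A minimal sketch of how the receive path typically uses this helper,
 * modelled on ip_rcv_finish(); the skb is assumed to already carry a
 * valid IPv4 header.
 */
static int example_route_incoming(struct sk_buff *skb, struct net_device *dev)
{
        const struct iphdr *iph = ip_hdr(skb);

        /* on success the looked-up dst is attached to the skb */
        return ip_route_input_noref(skb, iph->daddr, iph->saddr,
                                    iph->tos, dev);
}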
2137
2138 /* called with rcu_read_lock held */
2139 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2140                        u8 tos, struct net_device *dev, struct fib_result *res)
2141 {
2142         /* Multicast recognition logic was moved from the route cache to here.
2143            The problem was that too many Ethernet cards have broken/missing
2144            hardware multicast filters :-( As a result, a host on a multicast
2145            network acquires a lot of useless route cache entries, e.g. for
2146            SDR messages from all over the world. Now we try to get rid of them.
2147            Really, provided the software IP multicast filter is organized
2148            reasonably (at least, hashed), it does not result in a slowdown
2149            compared with route cache reject entries.
2150            Note that multicast routers are not affected, because a
2151            route cache entry is created eventually.
2152          */
2153         if (ipv4_is_multicast(daddr)) {
2154                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2155                 int our = 0;
2156                 int err = -EINVAL;
2157
2158                 if (!in_dev)
2159                         return err;
2160                 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2161                                       ip_hdr(skb)->protocol);
2162
2163                 /* check l3 master if no match yet */
2164                 if (!our && netif_is_l3_slave(dev)) {
2165                         struct in_device *l3_in_dev;
2166
2167                         l3_in_dev = __in_dev_get_rcu(skb->dev);
2168                         if (l3_in_dev)
2169                                 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2170                                                       ip_hdr(skb)->protocol);
2171                 }
2172
2173                 if (our
2174 #ifdef CONFIG_IP_MROUTE
2175                         ||
2176                     (!ipv4_is_local_multicast(daddr) &&
2177                      IN_DEV_MFORWARD(in_dev))
2178 #endif
2179                    ) {
2180                         err = ip_route_input_mc(skb, daddr, saddr,
2181                                                 tos, dev, our);
2182                 }
2183                 return err;
2184         }
2185
2186         return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2187 }
2188
2189 /* called with rcu_read_lock() */
2190 static struct rtable *__mkroute_output(const struct fib_result *res,
2191                                        const struct flowi4 *fl4, int orig_oif,
2192                                        struct net_device *dev_out,
2193                                        unsigned int flags)
2194 {
2195         struct fib_info *fi = res->fi;
2196         struct fib_nh_exception *fnhe;
2197         struct in_device *in_dev;
2198         u16 type = res->type;
2199         struct rtable *rth;
2200         bool do_cache;
2201
2202         in_dev = __in_dev_get_rcu(dev_out);
2203         if (!in_dev)
2204                 return ERR_PTR(-EINVAL);
2205
2206         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2207                 if (ipv4_is_loopback(fl4->saddr) &&
2208                     !(dev_out->flags & IFF_LOOPBACK) &&
2209                     !netif_is_l3_master(dev_out))
2210                         return ERR_PTR(-EINVAL);
2211
2212         if (ipv4_is_lbcast(fl4->daddr))
2213                 type = RTN_BROADCAST;
2214         else if (ipv4_is_multicast(fl4->daddr))
2215                 type = RTN_MULTICAST;
2216         else if (ipv4_is_zeronet(fl4->daddr))
2217                 return ERR_PTR(-EINVAL);
2218
2219         if (dev_out->flags & IFF_LOOPBACK)
2220                 flags |= RTCF_LOCAL;
2221
2222         do_cache = true;
2223         if (type == RTN_BROADCAST) {
2224                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2225                 fi = NULL;
2226         } else if (type == RTN_MULTICAST) {
2227                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2228                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2229                                      fl4->flowi4_proto))
2230                         flags &= ~RTCF_LOCAL;
2231                 else
2232                         do_cache = false;
2233                 /* If a multicast route does not exist, use the
2234                  * default one, but do not use a gateway in this case.
2235                  * Yes, it is a hack.
2236                  */
2237                 if (fi && res->prefixlen < 4)
2238                         fi = NULL;
2239         } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2240                    (orig_oif != dev_out->ifindex)) {
2241                 /* For local routes that require a particular output interface
2242                  * we do not want to cache the result.  Caching the result
2243                  * causes incorrect behaviour when there are multiple source
2244                  * addresses on the interface, the end result being that if the
2245                  * intended recipient is waiting on that interface for the
2246                  * packet he won't receive it because it will be delivered on
2247                  * the loopback interface and the IP_PKTINFO ipi_ifindex will
2248                  * be set to the loopback interface as well.
2249                  */
2250                 do_cache = false;
2251         }
2252
2253         fnhe = NULL;
2254         do_cache &= fi != NULL;
2255         if (fi) {
2256                 struct rtable __rcu **prth;
2257                 struct fib_nh *nh = &FIB_RES_NH(*res);
2258
2259                 fnhe = find_exception(nh, fl4->daddr);
2260                 if (!do_cache)
2261                         goto add;
2262                 if (fnhe) {
2263                         prth = &fnhe->fnhe_rth_output;
2264                 } else {
2265                         if (unlikely(fl4->flowi4_flags &
2266                                      FLOWI_FLAG_KNOWN_NH &&
2267                                      !(nh->nh_gw &&
2268                                        nh->nh_scope == RT_SCOPE_LINK))) {
2269                                 do_cache = false;
2270                                 goto add;
2271                         }
2272                         prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2273                 }
2274                 rth = rcu_dereference(*prth);
2275                 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2276                         return rth;
2277         }
2278
2279 add:
2280         rth = rt_dst_alloc(dev_out, flags, type,
2281                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2282                            IN_DEV_CONF_GET(in_dev, NOXFRM),
2283                            do_cache);
2284         if (!rth)
2285                 return ERR_PTR(-ENOBUFS);
2286
2287         rth->rt_iif = orig_oif;
2288
2289         RT_CACHE_STAT_INC(out_slow_tot);
2290
2291         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2292                 if (flags & RTCF_LOCAL &&
2293                     !(dev_out->flags & IFF_LOOPBACK)) {
2294                         rth->dst.output = ip_mc_output;
2295                         RT_CACHE_STAT_INC(out_slow_mc);
2296                 }
2297 #ifdef CONFIG_IP_MROUTE
2298                 if (type == RTN_MULTICAST) {
2299                         if (IN_DEV_MFORWARD(in_dev) &&
2300                             !ipv4_is_local_multicast(fl4->daddr)) {
2301                                 rth->dst.input = ip_mr_input;
2302                                 rth->dst.output = ip_mc_output;
2303                         }
2304                 }
2305 #endif
2306         }
2307
2308         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2309         lwtunnel_set_redirect(&rth->dst);
2310
2311         return rth;
2312 }
2313
2314 /*
2315  * Major route resolver routine.
2316  */
2317
2318 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2319                                         const struct sk_buff *skb)
2320 {
2321         __u8 tos = RT_FL_TOS(fl4);
2322         struct fib_result res = {
2323                 .type           = RTN_UNSPEC,
2324                 .fi             = NULL,
2325                 .table          = NULL,
2326                 .tclassid       = 0,
2327         };
2328         struct rtable *rth;
2329
2330         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2331         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2332         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2333                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2334
2335         rcu_read_lock();
2336         rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2337         rcu_read_unlock();
2338
2339         return rth;
2340 }
2341 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2342
2343 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2344                                             struct fib_result *res,
2345                                             const struct sk_buff *skb)
2346 {
2347         struct net_device *dev_out = NULL;
2348         int orig_oif = fl4->flowi4_oif;
2349         unsigned int flags = 0;
2350         struct rtable *rth;
2351         int err = -ENETUNREACH;
2352
2353         if (fl4->saddr) {
2354                 rth = ERR_PTR(-EINVAL);
2355                 if (ipv4_is_multicast(fl4->saddr) ||
2356                     ipv4_is_lbcast(fl4->saddr) ||
2357                     ipv4_is_zeronet(fl4->saddr))
2358                         goto out;
2359
2360                 /* I removed the check for oif == dev_out->oif here.
2361                    It was wrong for two reasons:
2362                    1. ip_dev_find(net, saddr) can return the wrong iface if
2363                       saddr is assigned to multiple interfaces.
2364                    2. Moreover, we are allowed to send packets with the saddr
2365                       of another iface. --ANK
2366                  */
2367
2368                 if (fl4->flowi4_oif == 0 &&
2369                     (ipv4_is_multicast(fl4->daddr) ||
2370                      ipv4_is_lbcast(fl4->daddr))) {
2371                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2372                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2373                         if (!dev_out)
2374                                 goto out;
2375
2376                         /* Special hack: the user can direct multicasts
2377                            and limited broadcasts via the necessary interface
2378                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2379                            This hack is not just for fun, it allows
2380                            vic, vat and friends to work.
2381                            They bind a socket to loopback, set ttl to zero
2382                            and expect that it will work.
2383                            From the viewpoint of the routing cache they are broken,
2384                            because we are not allowed to build a multicast path
2385                            with a loopback source addr (look, the routing cache
2386                            cannot know that ttl is zero, so the packet
2387                            will not leave this host and the route looks valid).
2388                            Luckily, this hack is a good workaround.
2389                          */
2390
2391                         fl4->flowi4_oif = dev_out->ifindex;
2392                         goto make_route;
2393                 }
2394
2395                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2396                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2397                         if (!__ip_dev_find(net, fl4->saddr, false))
2398                                 goto out;
2399                 }
2400         }
2401
2402
2403         if (fl4->flowi4_oif) {
2404                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2405                 rth = ERR_PTR(-ENODEV);
2406                 if (!dev_out)
2407                         goto out;
2408
2409                 /* RACE: Check return value of inet_select_addr instead. */
2410                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2411                         rth = ERR_PTR(-ENETUNREACH);
2412                         goto out;
2413                 }
2414                 if (ipv4_is_local_multicast(fl4->daddr) ||
2415                     ipv4_is_lbcast(fl4->daddr) ||
2416                     fl4->flowi4_proto == IPPROTO_IGMP) {
2417                         if (!fl4->saddr)
2418                                 fl4->saddr = inet_select_addr(dev_out, 0,
2419                                                               RT_SCOPE_LINK);
2420                         goto make_route;
2421                 }
2422                 if (!fl4->saddr) {
2423                         if (ipv4_is_multicast(fl4->daddr))
2424                                 fl4->saddr = inet_select_addr(dev_out, 0,
2425                                                               fl4->flowi4_scope);
2426                         else if (!fl4->daddr)
2427                                 fl4->saddr = inet_select_addr(dev_out, 0,
2428                                                               RT_SCOPE_HOST);
2429                 }
2430         }
2431
2432         if (!fl4->daddr) {
2433                 fl4->daddr = fl4->saddr;
2434                 if (!fl4->daddr)
2435                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2436                 dev_out = net->loopback_dev;
2437                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2438                 res->type = RTN_LOCAL;
2439                 flags |= RTCF_LOCAL;
2440                 goto make_route;
2441         }
2442
2443         err = fib_lookup(net, fl4, res, 0);
2444         if (err) {
2445                 res->fi = NULL;
2446                 res->table = NULL;
2447                 if (fl4->flowi4_oif &&
2448                     (ipv4_is_multicast(fl4->daddr) ||
2449                     !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2450                         /* Apparently, the routing tables are wrong. Assume
2451                            that the destination is on-link.
2452
2453                            WHY? DW.
2454                            Because we are allowed to send to an iface
2455                            even if it has NO routes and NO assigned
2456                            addresses. When oif is specified, the routing
2457                            tables are looked up with only one purpose:
2458                            to catch whether the destination is gatewayed, rather
2459                            than direct. Moreover, if MSG_DONTROUTE is set,
2460                            we send the packet, ignoring both the routing tables
2461                            and the ifaddr state. --ANK
2462
2463
2464                            We could do this even if oif is unknown,
2465                            much as IPv6 does, but we do not.
2466                          */
2467
2468                         if (fl4->saddr == 0)
2469                                 fl4->saddr = inet_select_addr(dev_out, 0,
2470                                                               RT_SCOPE_LINK);
2471                         res->type = RTN_UNICAST;
2472                         goto make_route;
2473                 }
2474                 rth = ERR_PTR(err);
2475                 goto out;
2476         }
2477
2478         if (res->type == RTN_LOCAL) {
2479                 if (!fl4->saddr) {
2480                         if (res->fi->fib_prefsrc)
2481                                 fl4->saddr = res->fi->fib_prefsrc;
2482                         else
2483                                 fl4->saddr = fl4->daddr;
2484                 }
2485
2486                 /* L3 master device is the loopback for that domain */
2487                 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2488                         net->loopback_dev;
2489
2490                 /* make sure orig_oif points to fib result device even
2491                  * though packet rx/tx happens over loopback or l3mdev
2492                  */
2493                 orig_oif = FIB_RES_OIF(*res);
2494
2495                 fl4->flowi4_oif = dev_out->ifindex;
2496                 flags |= RTCF_LOCAL;
2497                 goto make_route;
2498         }
2499
2500         fib_select_path(net, res, fl4, skb);
2501
2502         dev_out = FIB_RES_DEV(*res);
2503         fl4->flowi4_oif = dev_out->ifindex;
2504
2505
2506 make_route:
2507         rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2508
2509 out:
2510         return rth;
2511 }
2512
2513 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2514 {
2515         return NULL;
2516 }
2517
2518 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2519 {
2520         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2521
2522         return mtu ? : dst->dev->mtu;
2523 }
2524
2525 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2526                                           struct sk_buff *skb, u32 mtu)
2527 {
2528 }
2529
2530 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2531                                        struct sk_buff *skb)
2532 {
2533 }
2534
2535 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2536                                           unsigned long old)
2537 {
2538         return NULL;
2539 }
2540
2541 static struct dst_ops ipv4_dst_blackhole_ops = {
2542         .family                 =       AF_INET,
2543         .check                  =       ipv4_blackhole_dst_check,
2544         .mtu                    =       ipv4_blackhole_mtu,
2545         .default_advmss         =       ipv4_default_advmss,
2546         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2547         .redirect               =       ipv4_rt_blackhole_redirect,
2548         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2549         .neigh_lookup           =       ipv4_neigh_lookup,
2550 };
2551
2552 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2553 {
2554         struct rtable *ort = (struct rtable *) dst_orig;
2555         struct rtable *rt;
2556
2557         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2558         if (rt) {
2559                 struct dst_entry *new = &rt->dst;
2560
2561                 new->__use = 1;
2562                 new->input = dst_discard;
2563                 new->output = dst_discard_out;
2564
2565                 new->dev = net->loopback_dev;
2566                 if (new->dev)
2567                         dev_hold(new->dev);
2568
2569                 rt->rt_is_input = ort->rt_is_input;
2570                 rt->rt_iif = ort->rt_iif;
2571                 rt->rt_pmtu = ort->rt_pmtu;
2572                 rt->rt_mtu_locked = ort->rt_mtu_locked;
2573
2574                 rt->rt_genid = rt_genid_ipv4(net);
2575                 rt->rt_flags = ort->rt_flags;
2576                 rt->rt_type = ort->rt_type;
2577                 rt->rt_gateway = ort->rt_gateway;
2578                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2579
2580                 INIT_LIST_HEAD(&rt->rt_uncached);
2581         }
2582
2583         dst_release(dst_orig);
2584
2585         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2586 }
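
/* A minimal sketch of the intended use, mirroring the xfrm layer: swap a
 * real route for its discard-only copy while state is still being
 * negotiated. Note that ipv4_blackhole_route() consumes the reference on
 * dst_orig itself.
 */
static struct dst_entry *example_blackhole_swap(struct net *net,
                                                struct rtable *rt)
{
        return ipv4_blackhole_route(net, &rt->dst);     /* never forwards */
}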
2587
2588 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2589                                     const struct sock *sk)
2590 {
2591         struct rtable *rt = __ip_route_output_key(net, flp4);
2592
2593         if (IS_ERR(rt))
2594                 return rt;
2595
2596         if (flp4->flowi4_proto)
2597                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2598                                                         flowi4_to_flowi(flp4),
2599                                                         sk, 0);
2600
2601         return rt;
2602 }
2603 EXPORT_SYMBOL_GPL(ip_route_output_flow);
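
/* A minimal sketch of an output lookup through the helper above; the
 * destination and protocol are illustrative, and a real caller would
 * normally pass its socket rather than NULL.
 */
static int example_output_lookup(struct net *net, __be32 daddr)
{
        struct flowi4 fl4 = {
                .daddr = daddr,
                .flowi4_proto = IPPROTO_UDP,
        };
        struct rtable *rt = ip_route_output_flow(net, &fl4, NULL);

        if (IS_ERR(rt))
                return PTR_ERR(rt);
        /* ... transmit via rt->dst ... */
        ip_rt_put(rt);                  /* drop the reference we were given */
        return 0;
}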
2604
2605 /* called with rcu_read_lock held */
2606 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2607                         struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2608                         struct sk_buff *skb, u32 portid, u32 seq)
2609 {
2610         struct rtmsg *r;
2611         struct nlmsghdr *nlh;
2612         unsigned long expires = 0;
2613         u32 error;
2614         u32 metrics[RTAX_MAX];
2615
2616         nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
2617         if (!nlh)
2618                 return -EMSGSIZE;
2619
2620         r = nlmsg_data(nlh);
2621         r->rtm_family    = AF_INET;
2622         r->rtm_dst_len  = 32;
2623         r->rtm_src_len  = 0;
2624         r->rtm_tos      = fl4->flowi4_tos;
2625         r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2626         if (nla_put_u32(skb, RTA_TABLE, table_id))
2627                 goto nla_put_failure;
2628         r->rtm_type     = rt->rt_type;
2629         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2630         r->rtm_protocol = RTPROT_UNSPEC;
2631         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2632         if (rt->rt_flags & RTCF_NOTIFY)
2633                 r->rtm_flags |= RTM_F_NOTIFY;
2634         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2635                 r->rtm_flags |= RTCF_DOREDIRECT;
2636
2637         if (nla_put_in_addr(skb, RTA_DST, dst))
2638                 goto nla_put_failure;
2639         if (src) {
2640                 r->rtm_src_len = 32;
2641                 if (nla_put_in_addr(skb, RTA_SRC, src))
2642                         goto nla_put_failure;
2643         }
2644         if (rt->dst.dev &&
2645             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2646                 goto nla_put_failure;
2647 #ifdef CONFIG_IP_ROUTE_CLASSID
2648         if (rt->dst.tclassid &&
2649             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2650                 goto nla_put_failure;
2651 #endif
2652         if (!rt_is_input_route(rt) &&
2653             fl4->saddr != src) {
2654                 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2655                         goto nla_put_failure;
2656         }
2657         if (rt->rt_uses_gateway &&
2658             nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2659                 goto nla_put_failure;
2660
2661         expires = rt->dst.expires;
2662         if (expires) {
2663                 unsigned long now = jiffies;
2664
2665                 if (time_before(now, expires))
2666                         expires -= now;
2667                 else
2668                         expires = 0;
2669         }
2670
2671         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2672         if (rt->rt_pmtu && expires)
2673                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2674         if (rt->rt_mtu_locked && expires)
2675                 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2676         if (rtnetlink_put_metrics(skb, metrics) < 0)
2677                 goto nla_put_failure;
2678
2679         if (fl4->flowi4_mark &&
2680             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2681                 goto nla_put_failure;
2682
2683         if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2684             nla_put_u32(skb, RTA_UID,
2685                         from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2686                 goto nla_put_failure;
2687
2688         error = rt->dst.error;
2689
2690         if (rt_is_input_route(rt)) {
2691 #ifdef CONFIG_IP_MROUTE
2692                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2693                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2694                         int err = ipmr_get_route(net, skb,
2695                                                  fl4->saddr, fl4->daddr,
2696                                                  r, portid);
2697
2698                         if (err <= 0) {
2699                                 if (err == 0)
2700                                         return 0;
2701                                 goto nla_put_failure;
2702                         }
2703                 } else
2704 #endif
2705                         if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2706                                 goto nla_put_failure;
2707         }
2708
2709         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2710                 goto nla_put_failure;
2711
2712         nlmsg_end(skb, nlh);
2713         return 0;
2714
2715 nla_put_failure:
2716         nlmsg_cancel(skb, nlh);
2717         return -EMSGSIZE;
2718 }
2719
2720 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
2721                                                    u8 ip_proto, __be16 sport,
2722                                                    __be16 dport)
2723 {
2724         struct sk_buff *skb;
2725         struct iphdr *iph;
2726
2727         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2728         if (!skb)
2729                 return NULL;
2730
2731         /* Reserve room for dummy headers; this skb can pass
2732          * through a good chunk of the routing engine.
2733          */
2734         skb_reset_mac_header(skb);
2735         skb_reset_network_header(skb);
2736         skb->protocol = htons(ETH_P_IP);
2737         iph = skb_put(skb, sizeof(struct iphdr));
2738         iph->protocol = ip_proto;
2739         iph->saddr = src;
2740         iph->daddr = dst;
2741         iph->version = 0x4;
2742         iph->frag_off = 0;
2743         iph->ihl = 0x5;
2744         skb_set_transport_header(skb, skb->len);
2745
2746         switch (iph->protocol) {
2747         case IPPROTO_UDP: {
2748                 struct udphdr *udph;
2749
2750                 udph = skb_put_zero(skb, sizeof(struct udphdr));
2751                 udph->source = sport;
2752                 udph->dest = dport;
2753                 udph->len = sizeof(struct udphdr);
2754                 udph->check = 0;
2755                 break;
2756         }
2757         case IPPROTO_TCP: {
2758                 struct tcphdr *tcph;
2759
2760                 tcph = skb_put_zero(skb, sizeof(struct tcphdr));
2761                 tcph->source    = sport;
2762                 tcph->dest      = dport;
2763                 tcph->doff      = sizeof(struct tcphdr) / 4;
2764                 tcph->rst = 1;
2765                 tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
2766                                             src, dst, 0);
2767                 break;
2768         }
2769         case IPPROTO_ICMP: {
2770                 struct icmphdr *icmph;
2771
2772                 icmph = skb_put_zero(skb, sizeof(struct icmphdr));
2773                 icmph->type = ICMP_ECHO;
2774                 icmph->code = 0;
2775         }
2776         }
2777
2778         return skb;
2779 }
2780
2781 static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
2782                                        const struct nlmsghdr *nlh,
2783                                        struct nlattr **tb,
2784                                        struct netlink_ext_ack *extack)
2785 {
2786         struct rtmsg *rtm;
2787         int i, err;
2788
2789         if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
2790                 NL_SET_ERR_MSG(extack,
2791                                "ipv4: Invalid header for route get request");
2792                 return -EINVAL;
2793         }
2794
2795         if (!netlink_strict_get_check(skb))
2796                 return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX,
2797                                    rtm_ipv4_policy, extack);
2798
2799         rtm = nlmsg_data(nlh);
2800         if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
2801             (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
2802             rtm->rtm_table || rtm->rtm_protocol ||
2803             rtm->rtm_scope || rtm->rtm_type) {
2804                 NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
2805                 return -EINVAL;
2806         }
2807
2808         if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
2809                                RTM_F_LOOKUP_TABLE |
2810                                RTM_F_FIB_MATCH)) {
2811                 NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
2812                 return -EINVAL;
2813         }
2814
2815         err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
2816                                  rtm_ipv4_policy, extack);
2817         if (err)
2818                 return err;
2819
2820         if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
2821             (tb[RTA_DST] && !rtm->rtm_dst_len)) {
2822                 NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
2823                 return -EINVAL;
2824         }
2825
2826         for (i = 0; i <= RTA_MAX; i++) {
2827                 if (!tb[i])
2828                         continue;
2829
2830                 switch (i) {
2831                 case RTA_IIF:
2832                 case RTA_OIF:
2833                 case RTA_SRC:
2834                 case RTA_DST:
2835                 case RTA_IP_PROTO:
2836                 case RTA_SPORT:
2837                 case RTA_DPORT:
2838                 case RTA_MARK:
2839                 case RTA_UID:
2840                         break;
2841                 default:
2842                         NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
2843                         return -EINVAL;
2844                 }
2845         }
2846
2847         return 0;
2848 }
2849
2850 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2851                              struct netlink_ext_ack *extack)
2852 {
2853         struct net *net = sock_net(in_skb->sk);
2854         struct nlattr *tb[RTA_MAX+1];
2855         u32 table_id = RT_TABLE_MAIN;
2856         __be16 sport = 0, dport = 0;
2857         struct fib_result res = {};
2858         u8 ip_proto = IPPROTO_UDP;
2859         struct rtable *rt = NULL;
2860         struct sk_buff *skb;
2861         struct rtmsg *rtm;
2862         struct flowi4 fl4 = {};
2863         __be32 dst = 0;
2864         __be32 src = 0;
2865         kuid_t uid;
2866         u32 iif;
2867         int err;
2868         int mark;
2869
2870         err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
2871         if (err < 0)
2872                 return err;
2873
2874         rtm = nlmsg_data(nlh);
2875         src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2876         dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2877         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2878         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2879         if (tb[RTA_UID])
2880                 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2881         else
2882                 uid = (iif ? INVALID_UID : current_uid());
2883
2884         if (tb[RTA_IP_PROTO]) {
2885                 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
2886                                                   &ip_proto, AF_INET, extack);
2887                 if (err)
2888                         return err;
2889         }
2890
2891         if (tb[RTA_SPORT])
2892                 sport = nla_get_be16(tb[RTA_SPORT]);
2893
2894         if (tb[RTA_DPORT])
2895                 dport = nla_get_be16(tb[RTA_DPORT]);
2896
2897         skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
2898         if (!skb)
2899                 return -ENOBUFS;
2900
2901         fl4.daddr = dst;
2902         fl4.saddr = src;
2903         fl4.flowi4_tos = rtm->rtm_tos;
2904         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2905         fl4.flowi4_mark = mark;
2906         fl4.flowi4_uid = uid;
2907         if (sport)
2908                 fl4.fl4_sport = sport;
2909         if (dport)
2910                 fl4.fl4_dport = dport;
2911         fl4.flowi4_proto = ip_proto;
2912
2913         rcu_read_lock();
2914
	if (iif) {
		struct net_device *dev;

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			err = -ENODEV;
			goto errout_rcu;
		}

		fl4.flowi4_iif = iif; /* for rt_fill_info */
		skb->dev	= dev;
		skb->mark	= mark;
		err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
					 dev, &res);

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		fl4.flowi4_iif = LOOPBACK_IFINDEX;
		skb->dev = net->loopback_dev;
		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
		else
			skb_dst_set(skb, &rt->dst);
	}

	if (err)
		goto errout_rcu;

	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
		table_id = res.table ? res.table->tb_id : 0;

	/* reset skb for netlink reply msg */
	skb_trim(skb, 0);
	skb_reset_network_header(skb);
	skb_reset_transport_header(skb);
	skb_reset_mac_header(skb);
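
	/* RTM_F_FIB_MATCH reports the FIB entry that matched rather than
	 * the resolved destination: fib_dump_info() serializes the route as
	 * it sits in the table, while rt_fill_info() describes the final
	 * rtable (chosen nexthop, cached PMTU/redirect state, etc.).
	 */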
	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
		if (!res.fi) {
			err = fib_props[res.type].error;
			if (!err)
				err = -EHOSTUNREACH;
			goto errout_rcu;
		}
		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
				    rt->rt_type, res.prefix, res.prefixlen,
				    fl4.flowi4_tos, res.fi, 0);
	} else {
		err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
				   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
	}
	if (err < 0)
		goto errout_rcu;

	rcu_read_unlock();

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);

errout_free:
	return err;
errout_rcu:
	rcu_read_unlock();
	kfree_skb(skb);
	goto errout_free;
}

void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev));
}

#ifdef CONFIG_SYSCTL
static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_min_valid_pmtu __read_mostly	= IPV4_MIN_MTU;

static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
					void __user *buffer,
					size_t *lenp, loff_t *ppos)
{
	struct net *net = (struct net *)__ctl->extra1;

	if (write) {
		rt_cache_flush(net);
		fnhe_genid_bump(net);
		return 0;
	}

	return -EINVAL;
}
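
/*
 * The flush file registered below is write-only (mode 0200); writing
 * anything to it flushes the routing cache and bumps the fnhe genid for
 * that namespace.  A userspace sketch (illustration only), equivalent to
 * "echo 1 > /proc/sys/net/ipv4/route/flush":
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int fd = open("/proc/sys/net/ipv4/route/flush", O_WRONLY);
 *	if (fd >= 0) {
 *		write(fd, "1", 1);
 *		close(fd);
 *	}
 *
 * The written value is ignored; non-write access fails with -EINVAL, as
 * the handler above shows.
 */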

static struct ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/*  Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &ip_min_valid_pmtu,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};
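
/*
 * Every entry above appears as /proc/sys/net/ipv4/route/<procname>.  The
 * proc_dointvec_jiffies handlers store jiffies internally but read and
 * write seconds; gc_min_interval and gc_min_interval_ms share
 * &ip_rt_gc_min_interval and differ only in that unit conversion, which
 * is why the seconds variant is deprecated.  min_pmtu is clamped from
 * below through .extra1 so path MTU discovery can never go under
 * IPV4_MIN_MTU (68 bytes).
 */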

static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};

static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (!tbl)
			goto err_dup;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			tbl[0].procname = NULL;
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
	if (!net->ipv4.route_hdr)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}
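
/*
 * The usual per-netns sysctl pattern: every namespace other than init_net
 * gets its own kmemdup() copy of the table, both so .extra1 can carry the
 * owning struct net to ipv4_sysctl_rtcache_flush() and so the table can
 * be truncated (procname = NULL on the first entry) for namespaces owned
 * by an unprivileged user, hiding the flush file from them.
 */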

static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif

static __net_init int rt_genid_init(struct net *net)
{
	atomic_set(&net->ipv4.rt_genid, 0);
	atomic_set(&net->fnhe_genid, 0);
	atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
	return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};
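
/*
 * The generation counters are the cache-invalidation mechanism: cached
 * routes and fib_nh exceptions snapshot rt_genid/fnhe_genid when created,
 * and bumping either one (as ipv4_sysctl_rtcache_flush() does) makes all
 * existing entries stale without walking them.  dev_addr_genid is seeded
 * randomly rather than from zero so its values are not predictable across
 * namespaces.
 */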

static int __net_init ipv4_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv4.peers = bp;
	return 0;
}

static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv4.peers;

	net->ipv4.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init	=	ipv4_inetpeer_init,
	.exit	=	ipv4_inetpeer_exit,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */

int __init ip_rt_init(void)
{
	int cpu;

	ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents),
				  GFP_KERNEL);
	if (!ip_idents)
		panic("IP: failed to allocate ip_idents\n");

	prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
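
	/* This array (with ip_tstamps below) backs the IP ID generator in
	 * ip_idents_reserve() earlier in this file: a bucket, selected by a
	 * keyed hash of the flow, holds a counter from which IDs are taken.
	 * Pre-seeding with random bytes keeps early IDs unpredictable.
	 */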

	ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
	if (!ip_tstamps)
		panic("IP: failed to allocate ip_tstamps\n");

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}
#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	ipv4_dst_ops.gc_thresh = ~0;
	ip_rt_max_size = INT_MAX;

	devinet_init();
	ip_fib_init();

	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init();
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
		      RTNL_FLAG_DOIT_UNLOCKED);
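	/* RTNL_FLAG_DOIT_UNLOCKED: inet_rtm_getroute() runs without holding
	 * the RTNL mutex, which is why its lookups are done entirely under
	 * rcu_read_lock().
	 */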

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	register_pernet_subsys(&ipv4_inetpeer_ops);
	return 0;
}

#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif