net/ipv4/route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD,
35  *                                      though our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #define pr_fmt(fmt) "IPv4: " fmt
66
67 #include <linux/module.h>
68 #include <linux/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/skbuff.h>
83 #include <linux/inetdevice.h>
84 #include <linux/igmp.h>
85 #include <linux/pkt_sched.h>
86 #include <linux/mroute.h>
87 #include <linux/netfilter_ipv4.h>
88 #include <linux/random.h>
89 #include <linux/rcupdate.h>
90 #include <linux/times.h>
91 #include <linux/slab.h>
92 #include <linux/jhash.h>
93 #include <net/dst.h>
94 #include <net/dst_metadata.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/lwtunnel.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #endif
112 #include <net/secure_seq.h>
113 #include <net/ip_tunnels.h>
114 #include <net/l3mdev.h>
115
116 #include "fib_lookup.h"
117
118 #define RT_FL_TOS(oldflp4) \
119         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
120
121 #define RT_GC_TIMEOUT (300*HZ)
122
123 static int ip_rt_max_size;
124 static int ip_rt_redirect_number __read_mostly  = 9;
125 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
126 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
127 static int ip_rt_error_cost __read_mostly       = HZ;
128 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
129 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
130 static u32 ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
131 static int ip_rt_min_advmss __read_mostly       = 256;
132
133 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
134
135 /*
136  *      Interface to generic destination cache.
137  */
138
139 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
140 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
141 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
142 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
143 static void              ipv4_link_failure(struct sk_buff *skb);
144 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
145                                            struct sk_buff *skb, u32 mtu);
146 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
147                                         struct sk_buff *skb);
148 static void             ipv4_dst_destroy(struct dst_entry *dst);
149
150 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
151 {
152         WARN_ON(1);
153         return NULL;
154 }
155
156 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
157                                            struct sk_buff *skb,
158                                            const void *daddr);
159 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
160
161 static struct dst_ops ipv4_dst_ops = {
162         .family =               AF_INET,
163         .check =                ipv4_dst_check,
164         .default_advmss =       ipv4_default_advmss,
165         .mtu =                  ipv4_mtu,
166         .cow_metrics =          ipv4_cow_metrics,
167         .destroy =              ipv4_dst_destroy,
168         .negative_advice =      ipv4_negative_advice,
169         .link_failure =         ipv4_link_failure,
170         .update_pmtu =          ip_rt_update_pmtu,
171         .redirect =             ip_do_redirect,
172         .local_out =            __ip_local_out,
173         .neigh_lookup =         ipv4_neigh_lookup,
174         .confirm_neigh =        ipv4_confirm_neigh,
175 };
176
177 #define ECN_OR_COST(class)      TC_PRIO_##class
178
179 const __u8 ip_tos2prio[16] = {
180         TC_PRIO_BESTEFFORT,
181         ECN_OR_COST(BESTEFFORT),
182         TC_PRIO_BESTEFFORT,
183         ECN_OR_COST(BESTEFFORT),
184         TC_PRIO_BULK,
185         ECN_OR_COST(BULK),
186         TC_PRIO_BULK,
187         ECN_OR_COST(BULK),
188         TC_PRIO_INTERACTIVE,
189         ECN_OR_COST(INTERACTIVE),
190         TC_PRIO_INTERACTIVE,
191         ECN_OR_COST(INTERACTIVE),
192         TC_PRIO_INTERACTIVE_BULK,
193         ECN_OR_COST(INTERACTIVE_BULK),
194         TC_PRIO_INTERACTIVE_BULK,
195         ECN_OR_COST(INTERACTIVE_BULK)
196 };
197 EXPORT_SYMBOL(ip_tos2prio);
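
/* How this table gets used (the indexing helper lives outside this file, so
 * treat the exact location as an assumption): rt_tos2priority() in
 * include/net/route.h does, roughly,
 *
 *	prio = ip_tos2prio[IPTOS_TOS(tos) >> 1];
 *
 * so IPTOS_LOWDELAY (0x10) selects TC_PRIO_INTERACTIVE, IPTOS_THROUGHPUT
 * (0x08) selects TC_PRIO_BULK and a zero TOS selects TC_PRIO_BESTEFFORT.
 * With ECN_OR_COST() defined as above, the odd entries map to the same
 * priority class as their even neighbours.
 */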
198
199 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
200 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
201
202 #ifdef CONFIG_PROC_FS
203 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
204 {
205         if (*pos)
206                 return NULL;
207         return SEQ_START_TOKEN;
208 }
209
210 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
211 {
212         ++*pos;
213         return NULL;
214 }
215
216 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
217 {
218 }
219
220 static int rt_cache_seq_show(struct seq_file *seq, void *v)
221 {
222         if (v == SEQ_START_TOKEN)
223                 seq_printf(seq, "%-127s\n",
224                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
225                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
226                            "HHUptod\tSpecDst");
227         return 0;
228 }
229
230 static const struct seq_operations rt_cache_seq_ops = {
231         .start  = rt_cache_seq_start,
232         .next   = rt_cache_seq_next,
233         .stop   = rt_cache_seq_stop,
234         .show   = rt_cache_seq_show,
235 };
236
237 static int rt_cache_seq_open(struct inode *inode, struct file *file)
238 {
239         return seq_open(file, &rt_cache_seq_ops);
240 }
241
242 static const struct file_operations rt_cache_seq_fops = {
243         .open    = rt_cache_seq_open,
244         .read    = seq_read,
245         .llseek  = seq_lseek,
246         .release = seq_release,
247 };
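
/* The legacy IPv4 routing cache is long gone, so the /proc/net/rt_cache file
 * built from these seq ops only ever prints the header line above (.next
 * always returns NULL); it is presumably kept so tools that parse the old
 * format continue to work.
 */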
248
249
250 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
251 {
252         int cpu;
253
254         if (*pos == 0)
255                 return SEQ_START_TOKEN;
256
257         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
258                 if (!cpu_possible(cpu))
259                         continue;
260                 *pos = cpu+1;
261                 return &per_cpu(rt_cache_stat, cpu);
262         }
263         return NULL;
264 }
265
266 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
267 {
268         int cpu;
269
270         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
271                 if (!cpu_possible(cpu))
272                         continue;
273                 *pos = cpu+1;
274                 return &per_cpu(rt_cache_stat, cpu);
275         }
276         return NULL;
277
278 }
279
280 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
281 {
282
283 }
284
285 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
286 {
287         struct rt_cache_stat *st = v;
288
289         if (v == SEQ_START_TOKEN) {
290                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
291                 return 0;
292         }
293
294         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
295                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
296                    dst_entries_get_slow(&ipv4_dst_ops),
297                    0, /* st->in_hit */
298                    st->in_slow_tot,
299                    st->in_slow_mc,
300                    st->in_no_route,
301                    st->in_brd,
302                    st->in_martian_dst,
303                    st->in_martian_src,
304
305                    0, /* st->out_hit */
306                    st->out_slow_tot,
307                    st->out_slow_mc,
308
309                    0, /* st->gc_total */
310                    0, /* st->gc_ignored */
311                    0, /* st->gc_goal_miss */
312                    0, /* st->gc_dst_overflow */
313                    0, /* st->in_hlist_search */
314                    0  /* st->out_hlist_search */
315                 );
316         return 0;
317 }
318
319 static const struct seq_operations rt_cpu_seq_ops = {
320         .start  = rt_cpu_seq_start,
321         .next   = rt_cpu_seq_next,
322         .stop   = rt_cpu_seq_stop,
323         .show   = rt_cpu_seq_show,
324 };
325
326
327 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
328 {
329         return seq_open(file, &rt_cpu_seq_ops);
330 }
331
332 static const struct file_operations rt_cpu_seq_fops = {
333         .open    = rt_cpu_seq_open,
334         .read    = seq_read,
335         .llseek  = seq_lseek,
336         .release = seq_release,
337 };
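
/* /proc/net/stat/rt_cache still prints one row per possible CPU, but note in
 * rt_cpu_seq_show() that the first column is the global dst entry count and
 * several columns (in_hit, out_hit, the gc_* counters, the hlist searches)
 * are hard-coded to zero: those statistics disappeared together with the old
 * routing cache and only the column layout survives.
 */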
338
339 #ifdef CONFIG_IP_ROUTE_CLASSID
340 static int rt_acct_proc_show(struct seq_file *m, void *v)
341 {
342         struct ip_rt_acct *dst, *src;
343         unsigned int i, j;
344
345         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
346         if (!dst)
347                 return -ENOMEM;
348
349         for_each_possible_cpu(i) {
350                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
351                 for (j = 0; j < 256; j++) {
352                         dst[j].o_bytes   += src[j].o_bytes;
353                         dst[j].o_packets += src[j].o_packets;
354                         dst[j].i_bytes   += src[j].i_bytes;
355                         dst[j].i_packets += src[j].i_packets;
356                 }
357         }
358
359         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
360         kfree(dst);
361         return 0;
362 }
363
364 static int rt_acct_proc_open(struct inode *inode, struct file *file)
365 {
366         return single_open(file, rt_acct_proc_show, NULL);
367 }
368
369 static const struct file_operations rt_acct_proc_fops = {
370         .open           = rt_acct_proc_open,
371         .read           = seq_read,
372         .llseek         = seq_lseek,
373         .release        = single_release,
374 };
375 #endif
376
377 static int __net_init ip_rt_do_proc_init(struct net *net)
378 {
379         struct proc_dir_entry *pde;
380
381         pde = proc_create("rt_cache", 0444, net->proc_net,
382                           &rt_cache_seq_fops);
383         if (!pde)
384                 goto err1;
385
386         pde = proc_create("rt_cache", 0444,
387                           net->proc_net_stat, &rt_cpu_seq_fops);
388         if (!pde)
389                 goto err2;
390
391 #ifdef CONFIG_IP_ROUTE_CLASSID
392         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
393         if (!pde)
394                 goto err3;
395 #endif
396         return 0;
397
398 #ifdef CONFIG_IP_ROUTE_CLASSID
399 err3:
400         remove_proc_entry("rt_cache", net->proc_net_stat);
401 #endif
402 err2:
403         remove_proc_entry("rt_cache", net->proc_net);
404 err1:
405         return -ENOMEM;
406 }
407
408 static void __net_exit ip_rt_do_proc_exit(struct net *net)
409 {
410         remove_proc_entry("rt_cache", net->proc_net_stat);
411         remove_proc_entry("rt_cache", net->proc_net);
412 #ifdef CONFIG_IP_ROUTE_CLASSID
413         remove_proc_entry("rt_acct", net->proc_net);
414 #endif
415 }
416
417 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
418         .init = ip_rt_do_proc_init,
419         .exit = ip_rt_do_proc_exit,
420 };
421
422 static int __init ip_rt_proc_init(void)
423 {
424         return register_pernet_subsys(&ip_rt_proc_ops);
425 }
426
427 #else
428 static inline int ip_rt_proc_init(void)
429 {
430         return 0;
431 }
432 #endif /* CONFIG_PROC_FS */
433
434 static inline bool rt_is_expired(const struct rtable *rth)
435 {
436         return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
437 }
438
439 void rt_cache_flush(struct net *net)
440 {
441         rt_genid_bump_ipv4(net);
442 }
443
444 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
445                                            struct sk_buff *skb,
446                                            const void *daddr)
447 {
448         struct net_device *dev = dst->dev;
449         const __be32 *pkey = daddr;
450         const struct rtable *rt;
451         struct neighbour *n;
452
453         rt = (const struct rtable *) dst;
454         if (rt->rt_gateway)
455                 pkey = (const __be32 *) &rt->rt_gateway;
456         else if (skb)
457                 pkey = &ip_hdr(skb)->daddr;
458
459         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
460         if (n)
461                 return n;
462         return neigh_create(&arp_tbl, pkey, dev);
463 }
464
465 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
466 {
467         struct net_device *dev = dst->dev;
468         const __be32 *pkey = daddr;
469         const struct rtable *rt;
470
471         rt = (const struct rtable *)dst;
472         if (rt->rt_gateway)
473                 pkey = (const __be32 *)&rt->rt_gateway;
474         else if (!daddr ||
475                  (rt->rt_flags &
476                   (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
477                 return;
478
479         __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
480 }
481
482 #define IP_IDENTS_SZ 2048u
483
484 static atomic_t *ip_idents __read_mostly;
485 static u32 *ip_tstamps __read_mostly;
486
487 /* In order to protect privacy, we add a perturbation to identifiers
488  * if one generator is seldom used. This makes it hard for an attacker
489  * to infer how many packets were sent between two points in time.
490  */
491 u32 ip_idents_reserve(u32 hash, int segs)
492 {
493         u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
494         atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
495         u32 old = READ_ONCE(*p_tstamp);
496         u32 now = (u32)jiffies;
497         u32 new, delta = 0;
498
499         if (old != now && cmpxchg(p_tstamp, old, now) == old)
500                 delta = prandom_u32_max(now - old);
501
502         /* Do not use atomic_add_return() as it makes UBSAN unhappy */
503         do {
504                 old = (u32)atomic_read(p_id);
505                 new = old + delta + segs;
506         } while (atomic_cmpxchg(p_id, old, new) != old);
507
508         return new - segs;
509 }
510 EXPORT_SYMBOL(ip_idents_reserve);
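
/* Worked example of the perturbation (illustrative only; assumes HZ = 1000):
 * if a bucket was last used three seconds ago, now - old is ~3000 jiffies and
 * delta becomes a random value in [0, 3000), so the next ID jumps by
 * delta + segs instead of just segs. An observer comparing two IDs from the
 * same bucket can no longer count exactly how many packets were sent in
 * between, while a busy bucket (old == now) keeps the cheap "old + segs"
 * fast path.
 */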
511
512 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
513 {
514         static u32 ip_idents_hashrnd __read_mostly;
515         u32 hash, id;
516
517         net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
518
519         hash = jhash_3words((__force u32)iph->daddr,
520                             (__force u32)iph->saddr,
521                             iph->protocol ^ net_hash_mix(net),
522                             ip_idents_hashrnd);
523         id = ip_idents_reserve(hash, segs);
524         iph->id = htons(id);
525 }
526 EXPORT_SYMBOL(__ip_select_ident);
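
/* Callers normally reach this through the ip_select_ident() helpers in
 * include/net/ip.h (location noted here as an assumption, not defined in this
 * file), passing the number of segments being sent as @segs; because
 * ip_idents_reserve() returns "new - segs", a GSO burst gets a contiguous
 * block of IDs starting at the returned value.
 */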
527
528 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
529                              const struct sock *sk,
530                              const struct iphdr *iph,
531                              int oif, u8 tos,
532                              u8 prot, u32 mark, int flow_flags)
533 {
534         if (sk) {
535                 const struct inet_sock *inet = inet_sk(sk);
536
537                 oif = sk->sk_bound_dev_if;
538                 mark = sk->sk_mark;
539                 tos = RT_CONN_FLAGS(sk);
540                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
541         }
542         flowi4_init_output(fl4, oif, mark, tos,
543                            RT_SCOPE_UNIVERSE, prot,
544                            flow_flags,
545                            iph->daddr, iph->saddr, 0, 0,
546                            sock_net_uid(net, sk));
547 }
548
549 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
550                                const struct sock *sk)
551 {
552         const struct net *net = dev_net(skb->dev);
553         const struct iphdr *iph = ip_hdr(skb);
554         int oif = skb->dev->ifindex;
555         u8 tos = RT_TOS(iph->tos);
556         u8 prot = iph->protocol;
557         u32 mark = skb->mark;
558
559         __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
560 }
561
562 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
563 {
564         const struct inet_sock *inet = inet_sk(sk);
565         const struct ip_options_rcu *inet_opt;
566         __be32 daddr = inet->inet_daddr;
567
568         rcu_read_lock();
569         inet_opt = rcu_dereference(inet->inet_opt);
570         if (inet_opt && inet_opt->opt.srr)
571                 daddr = inet_opt->opt.faddr;
572         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
573                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
574                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
575                            inet_sk_flowi_flags(sk),
576                            daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
577         rcu_read_unlock();
578 }
579
580 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
581                                  const struct sk_buff *skb)
582 {
583         if (skb)
584                 build_skb_flow_key(fl4, skb, sk);
585         else
586                 build_sk_flow_key(fl4, sk);
587 }
588
589 static DEFINE_SPINLOCK(fnhe_lock);
590
591 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
592 {
593         struct rtable *rt;
594
595         rt = rcu_dereference(fnhe->fnhe_rth_input);
596         if (rt) {
597                 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
598                 dst_dev_put(&rt->dst);
599                 dst_release(&rt->dst);
600         }
601         rt = rcu_dereference(fnhe->fnhe_rth_output);
602         if (rt) {
603                 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
604                 dst_dev_put(&rt->dst);
605                 dst_release(&rt->dst);
606         }
607 }
608
609 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
610 {
611         struct fib_nh_exception *fnhe, *oldest;
612
613         oldest = rcu_dereference(hash->chain);
614         for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
615              fnhe = rcu_dereference(fnhe->fnhe_next)) {
616                 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
617                         oldest = fnhe;
618         }
619         fnhe_flush_routes(oldest);
620         return oldest;
621 }
622
623 static inline u32 fnhe_hashfun(__be32 daddr)
624 {
625         static u32 fnhe_hashrnd __read_mostly;
626         u32 hval;
627
628         net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
629         hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
630         return hash_32(hval, FNHE_HASH_SHIFT);
631 }
632
633 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
634 {
635         rt->rt_pmtu = fnhe->fnhe_pmtu;
636         rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
637         rt->dst.expires = fnhe->fnhe_expires;
638
639         if (fnhe->fnhe_gw) {
640                 rt->rt_flags |= RTCF_REDIRECTED;
641                 rt->rt_gateway = fnhe->fnhe_gw;
642                 rt->rt_uses_gateway = 1;
643         }
644 }
645
646 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
647                                   u32 pmtu, bool lock, unsigned long expires)
648 {
649         struct fnhe_hash_bucket *hash;
650         struct fib_nh_exception *fnhe;
651         struct rtable *rt;
652         u32 genid, hval;
653         unsigned int i;
654         int depth;
655
656         genid = fnhe_genid(dev_net(nh->nh_dev));
657         hval = fnhe_hashfun(daddr);
658
659         spin_lock_bh(&fnhe_lock);
660
661         hash = rcu_dereference(nh->nh_exceptions);
662         if (!hash) {
663                 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
664                 if (!hash)
665                         goto out_unlock;
666                 rcu_assign_pointer(nh->nh_exceptions, hash);
667         }
668
669         hash += hval;
670
671         depth = 0;
672         for (fnhe = rcu_dereference(hash->chain); fnhe;
673              fnhe = rcu_dereference(fnhe->fnhe_next)) {
674                 if (fnhe->fnhe_daddr == daddr)
675                         break;
676                 depth++;
677         }
678
679         if (fnhe) {
680                 if (fnhe->fnhe_genid != genid)
681                         fnhe->fnhe_genid = genid;
682                 if (gw)
683                         fnhe->fnhe_gw = gw;
684                 if (pmtu) {
685                         fnhe->fnhe_pmtu = pmtu;
686                         fnhe->fnhe_mtu_locked = lock;
687                 }
688                 fnhe->fnhe_expires = max(1UL, expires);
689                 /* Update all cached dsts too */
690                 rt = rcu_dereference(fnhe->fnhe_rth_input);
691                 if (rt)
692                         fill_route_from_fnhe(rt, fnhe);
693                 rt = rcu_dereference(fnhe->fnhe_rth_output);
694                 if (rt)
695                         fill_route_from_fnhe(rt, fnhe);
696         } else {
697                 if (depth > FNHE_RECLAIM_DEPTH)
698                         fnhe = fnhe_oldest(hash);
699                 else {
700                         fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
701                         if (!fnhe)
702                                 goto out_unlock;
703
704                         fnhe->fnhe_next = hash->chain;
705                         rcu_assign_pointer(hash->chain, fnhe);
706                 }
707                 fnhe->fnhe_genid = genid;
708                 fnhe->fnhe_daddr = daddr;
709                 fnhe->fnhe_gw = gw;
710                 fnhe->fnhe_pmtu = pmtu;
711                 fnhe->fnhe_mtu_locked = lock;
712                 fnhe->fnhe_expires = max(1UL, expires);
713
714                 /* Exception created; mark the cached routes for the nexthop
715                  * stale, so anyone caching it rechecks if this exception
716                  * applies to them.
717                  */
718                 rt = rcu_dereference(nh->nh_rth_input);
719                 if (rt)
720                         rt->dst.obsolete = DST_OBSOLETE_KILL;
721
722                 for_each_possible_cpu(i) {
723                         struct rtable __rcu **prt;
724                         prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
725                         rt = rcu_dereference(*prt);
726                         if (rt)
727                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
728                 }
729         }
730
731         fnhe->fnhe_stamp = jiffies;
732
733 out_unlock:
734         spin_unlock_bh(&fnhe_lock);
735 }
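
/* Orientation note (summarising call sites further down in this file):
 * update_or_create_fnhe() has two producers. __ip_do_redirect() records a
 * learned gateway with expires = jiffies + ip_rt_gc_timeout, and
 * __ip_rt_update_pmtu() records a learned path MTU (optionally locked) with
 * expires = jiffies + ip_rt_mtu_expires. find_exception() and
 * rt_bind_exception() below consume the entries.
 */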
736
737 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
738                              bool kill_route)
739 {
740         __be32 new_gw = icmp_hdr(skb)->un.gateway;
741         __be32 old_gw = ip_hdr(skb)->saddr;
742         struct net_device *dev = skb->dev;
743         struct in_device *in_dev;
744         struct fib_result res;
745         struct neighbour *n;
746         struct net *net;
747
748         switch (icmp_hdr(skb)->code & 7) {
749         case ICMP_REDIR_NET:
750         case ICMP_REDIR_NETTOS:
751         case ICMP_REDIR_HOST:
752         case ICMP_REDIR_HOSTTOS:
753                 break;
754
755         default:
756                 return;
757         }
758
759         if (rt->rt_gateway != old_gw)
760                 return;
761
762         in_dev = __in_dev_get_rcu(dev);
763         if (!in_dev)
764                 return;
765
766         net = dev_net(dev);
767         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
768             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
769             ipv4_is_zeronet(new_gw))
770                 goto reject_redirect;
771
772         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
773                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
774                         goto reject_redirect;
775                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
776                         goto reject_redirect;
777         } else {
778                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
779                         goto reject_redirect;
780         }
781
782         n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
783         if (!n)
784                 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
785         if (!IS_ERR(n)) {
786                 if (!(n->nud_state & NUD_VALID)) {
787                         neigh_event_send(n, NULL);
788                 } else {
789                         if (fib_lookup(net, fl4, &res, 0) == 0) {
790                                 struct fib_nh *nh = &FIB_RES_NH(res);
791
792                                 update_or_create_fnhe(nh, fl4->daddr, new_gw,
793                                                 0, false,
794                                                 jiffies + ip_rt_gc_timeout);
795                         }
796                         if (kill_route)
797                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
798                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
799                 }
800                 neigh_release(n);
801         }
802         return;
803
804 reject_redirect:
805 #ifdef CONFIG_IP_ROUTE_VERBOSE
806         if (IN_DEV_LOG_MARTIANS(in_dev)) {
807                 const struct iphdr *iph = (const struct iphdr *) skb->data;
808                 __be32 daddr = iph->daddr;
809                 __be32 saddr = iph->saddr;
810
811                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
812                                      "  Advised path = %pI4 -> %pI4\n",
813                                      &old_gw, dev->name, &new_gw,
814                                      &saddr, &daddr);
815         }
816 #endif
817         ;
818 }
819
820 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
821 {
822         struct rtable *rt;
823         struct flowi4 fl4;
824         const struct iphdr *iph = (const struct iphdr *) skb->data;
825         struct net *net = dev_net(skb->dev);
826         int oif = skb->dev->ifindex;
827         u8 tos = RT_TOS(iph->tos);
828         u8 prot = iph->protocol;
829         u32 mark = skb->mark;
830
831         rt = (struct rtable *) dst;
832
833         __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
834         __ip_do_redirect(rt, skb, &fl4, true);
835 }
836
837 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
838 {
839         struct rtable *rt = (struct rtable *)dst;
840         struct dst_entry *ret = dst;
841
842         if (rt) {
843                 if (dst->obsolete > 0) {
844                         ip_rt_put(rt);
845                         ret = NULL;
846                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
847                            rt->dst.expires) {
848                         ip_rt_put(rt);
849                         ret = NULL;
850                 }
851         }
852         return ret;
853 }
854
855 /*
856  * Algorithm:
857  *      1. The first ip_rt_redirect_number redirects are sent
858  *         with exponential backoff, then we stop sending them at all,
859  *         assuming that the host ignores our redirects.
860  *      2. If we did not see packets requiring redirects
861  *         during ip_rt_redirect_silence, we assume that the host
862  *         forgot the redirected route and start to send redirects again.
863  *
864  * This algorithm is much cheaper and more intelligent than dumb load limiting
865  * in icmp.c.
866  *
867  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
868  * and "frag. need" (breaks PMTU discovery) in icmp.c.
869  */
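
/* With the defaults above and HZ = 1000 this works out to (illustrative
 * arithmetic only): the first redirect goes out immediately, the required gap
 * before each further one doubles from ~40 ms up to ~5.1 s, at most
 * ip_rt_redirect_number (9) redirects are sent, and ~20.5 s without
 * redirect-worthy traffic resets the counter.
 */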
870
871 void ip_rt_send_redirect(struct sk_buff *skb)
872 {
873         struct rtable *rt = skb_rtable(skb);
874         struct in_device *in_dev;
875         struct inet_peer *peer;
876         struct net *net;
877         int log_martians;
878         int vif;
879
880         rcu_read_lock();
881         in_dev = __in_dev_get_rcu(rt->dst.dev);
882         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
883                 rcu_read_unlock();
884                 return;
885         }
886         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
887         vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
888         rcu_read_unlock();
889
890         net = dev_net(rt->dst.dev);
891         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
892         if (!peer) {
893                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
894                           rt_nexthop(rt, ip_hdr(skb)->daddr));
895                 return;
896         }
897
898         /* No redirected packets during ip_rt_redirect_silence;
899          * reset the algorithm.
900          */
901         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
902                 peer->rate_tokens = 0;
903
904         /* Too many ignored redirects; do not send anything and
905          * set peer->rate_last to the last seen redirected packet.
906          */
907         if (peer->rate_tokens >= ip_rt_redirect_number) {
908                 peer->rate_last = jiffies;
909                 goto out_put_peer;
910         }
911
912         /* Check for load limit; set rate_last to the latest sent
913          * redirect.
914          */
915         if (peer->rate_tokens == 0 ||
916             time_after(jiffies,
917                        (peer->rate_last +
918                         (ip_rt_redirect_load << peer->rate_tokens)))) {
919                 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
920
921                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
922                 peer->rate_last = jiffies;
923                 ++peer->rate_tokens;
924 #ifdef CONFIG_IP_ROUTE_VERBOSE
925                 if (log_martians &&
926                     peer->rate_tokens == ip_rt_redirect_number)
927                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
928                                              &ip_hdr(skb)->saddr, inet_iif(skb),
929                                              &ip_hdr(skb)->daddr, &gw);
930 #endif
931         }
932 out_put_peer:
933         inet_putpeer(peer);
934 }
935
936 static int ip_error(struct sk_buff *skb)
937 {
938         struct rtable *rt = skb_rtable(skb);
939         struct net_device *dev = skb->dev;
940         struct in_device *in_dev;
941         struct inet_peer *peer;
942         unsigned long now;
943         struct net *net;
944         bool send;
945         int code;
946
947         if (netif_is_l3_master(skb->dev)) {
948                 dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
949                 if (!dev)
950                         goto out;
951         }
952
953         in_dev = __in_dev_get_rcu(dev);
954
955         /* IP on this device is disabled. */
956         if (!in_dev)
957                 goto out;
958
959         net = dev_net(rt->dst.dev);
960         if (!IN_DEV_FORWARD(in_dev)) {
961                 switch (rt->dst.error) {
962                 case EHOSTUNREACH:
963                         __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
964                         break;
965
966                 case ENETUNREACH:
967                         __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
968                         break;
969                 }
970                 goto out;
971         }
972
973         switch (rt->dst.error) {
974         case EINVAL:
975         default:
976                 goto out;
977         case EHOSTUNREACH:
978                 code = ICMP_HOST_UNREACH;
979                 break;
980         case ENETUNREACH:
981                 code = ICMP_NET_UNREACH;
982                 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
983                 break;
984         case EACCES:
985                 code = ICMP_PKT_FILTERED;
986                 break;
987         }
988
989         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
990                                l3mdev_master_ifindex(skb->dev), 1);
991
992         send = true;
993         if (peer) {
994                 now = jiffies;
995                 peer->rate_tokens += now - peer->rate_last;
996                 if (peer->rate_tokens > ip_rt_error_burst)
997                         peer->rate_tokens = ip_rt_error_burst;
998                 peer->rate_last = now;
999                 if (peer->rate_tokens >= ip_rt_error_cost)
1000                         peer->rate_tokens -= ip_rt_error_cost;
1001                 else
1002                         send = false;
1003                 inet_putpeer(peer);
1004         }
1005         if (send)
1006                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1007
1008 out:    kfree_skb(skb);
1009         return 0;
1010 }
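
/* The peer->rate_tokens bucket above refills one token per jiffy up to
 * ip_rt_error_burst and each ICMP error drains ip_rt_error_cost, so with the
 * defaults (and HZ = 1000) that is roughly one destination-unreachable per
 * second per source address, with a burst allowance of five.
 */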
1011
1012 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1013 {
1014         struct dst_entry *dst = &rt->dst;
1015         struct fib_result res;
1016         bool lock = false;
1017
1018         if (ip_mtu_locked(dst))
1019                 return;
1020
1021         if (ipv4_mtu(dst) < mtu)
1022                 return;
1023
1024         if (mtu < ip_rt_min_pmtu) {
1025                 lock = true;
1026                 mtu = ip_rt_min_pmtu;
1027         }
1028
1029         if (rt->rt_pmtu == mtu &&
1030             time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1031                 return;
1032
1033         rcu_read_lock();
1034         if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
1035                 struct fib_nh *nh = &FIB_RES_NH(res);
1036
1037                 update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
1038                                       jiffies + ip_rt_mtu_expires);
1039         }
1040         rcu_read_unlock();
1041 }
1042
1043 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1044                               struct sk_buff *skb, u32 mtu)
1045 {
1046         struct rtable *rt = (struct rtable *) dst;
1047         struct flowi4 fl4;
1048
1049         ip_rt_build_flow_key(&fl4, sk, skb);
1050         __ip_rt_update_pmtu(rt, &fl4, mtu);
1051 }
1052
1053 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1054                       int oif, u32 mark, u8 protocol, int flow_flags)
1055 {
1056         const struct iphdr *iph = (const struct iphdr *) skb->data;
1057         struct flowi4 fl4;
1058         struct rtable *rt;
1059
1060         if (!mark)
1061                 mark = IP4_REPLY_MARK(net, skb->mark);
1062
1063         __build_flow_key(net, &fl4, NULL, iph, oif,
1064                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1065         rt = __ip_route_output_key(net, &fl4);
1066         if (!IS_ERR(rt)) {
1067                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1068                 ip_rt_put(rt);
1069         }
1070 }
1071 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1072
1073 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1074 {
1075         const struct iphdr *iph = (const struct iphdr *) skb->data;
1076         struct flowi4 fl4;
1077         struct rtable *rt;
1078
1079         __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1080
1081         if (!fl4.flowi4_mark)
1082                 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1083
1084         rt = __ip_route_output_key(sock_net(sk), &fl4);
1085         if (!IS_ERR(rt)) {
1086                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1087                 ip_rt_put(rt);
1088         }
1089 }
1090
1091 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1092 {
1093         const struct iphdr *iph = (const struct iphdr *) skb->data;
1094         struct flowi4 fl4;
1095         struct rtable *rt;
1096         struct dst_entry *odst = NULL;
1097         bool new = false;
1098         struct net *net = sock_net(sk);
1099
1100         bh_lock_sock(sk);
1101
1102         if (!ip_sk_accept_pmtu(sk))
1103                 goto out;
1104
1105         odst = sk_dst_get(sk);
1106
1107         if (sock_owned_by_user(sk) || !odst) {
1108                 __ipv4_sk_update_pmtu(skb, sk, mtu);
1109                 goto out;
1110         }
1111
1112         __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1113
1114         rt = (struct rtable *)odst;
1115         if (odst->obsolete && !odst->ops->check(odst, 0)) {
1116                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1117                 if (IS_ERR(rt))
1118                         goto out;
1119
1120                 new = true;
1121         }
1122
1123         __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);
1124
1125         if (!dst_check(&rt->dst, 0)) {
1126                 if (new)
1127                         dst_release(&rt->dst);
1128
1129                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1130                 if (IS_ERR(rt))
1131                         goto out;
1132
1133                 new = true;
1134         }
1135
1136         if (new)
1137                 sk_dst_set(sk, &rt->dst);
1138
1139 out:
1140         bh_unlock_sock(sk);
1141         dst_release(odst);
1142 }
1143 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1144
1145 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1146                    int oif, u32 mark, u8 protocol, int flow_flags)
1147 {
1148         const struct iphdr *iph = (const struct iphdr *) skb->data;
1149         struct flowi4 fl4;
1150         struct rtable *rt;
1151
1152         __build_flow_key(net, &fl4, NULL, iph, oif,
1153                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1154         rt = __ip_route_output_key(net, &fl4);
1155         if (!IS_ERR(rt)) {
1156                 __ip_do_redirect(rt, skb, &fl4, false);
1157                 ip_rt_put(rt);
1158         }
1159 }
1160 EXPORT_SYMBOL_GPL(ipv4_redirect);
1161
1162 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1163 {
1164         const struct iphdr *iph = (const struct iphdr *) skb->data;
1165         struct flowi4 fl4;
1166         struct rtable *rt;
1167         struct net *net = sock_net(sk);
1168
1169         __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1170         rt = __ip_route_output_key(net, &fl4);
1171         if (!IS_ERR(rt)) {
1172                 __ip_do_redirect(rt, skb, &fl4, false);
1173                 ip_rt_put(rt);
1174         }
1175 }
1176 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1177
1178 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1179 {
1180         struct rtable *rt = (struct rtable *) dst;
1181
1182         /* All IPV4 dsts are created with ->obsolete set to the value
1183          * DST_OBSOLETE_FORCE_CHK, which always forces validation calls
1184          * down into this function.
1185          *
1186          * When a PMTU/redirect information update invalidates a route,
1187          * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1188          * DST_OBSOLETE_DEAD by dst_free().
1189          */
1190         if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1191                 return NULL;
1192         return dst;
1193 }
1194
1195 static void ipv4_link_failure(struct sk_buff *skb)
1196 {
1197         struct rtable *rt;
1198
1199         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1200
1201         rt = skb_rtable(skb);
1202         if (rt)
1203                 dst_set_expires(&rt->dst, 0);
1204 }
1205
1206 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1207 {
1208         pr_debug("%s: %pI4 -> %pI4, %s\n",
1209                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1210                  skb->dev ? skb->dev->name : "?");
1211         kfree_skb(skb);
1212         WARN_ON(1);
1213         return 0;
1214 }
1215
1216 /*
1217    We do not cache the source address of the outgoing interface,
1218    because it is used only by the IP RR, TS and SRR options,
1219    so it is out of the fast path.
1220
1221    BTW remember: "addr" is allowed to be unaligned
1222    in IP options!
1223  */
1224
1225 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1226 {
1227         __be32 src;
1228
1229         if (rt_is_output_route(rt))
1230                 src = ip_hdr(skb)->saddr;
1231         else {
1232                 struct fib_result res;
1233                 struct flowi4 fl4;
1234                 struct iphdr *iph;
1235
1236                 iph = ip_hdr(skb);
1237
1238                 memset(&fl4, 0, sizeof(fl4));
1239                 fl4.daddr = iph->daddr;
1240                 fl4.saddr = iph->saddr;
1241                 fl4.flowi4_tos = RT_TOS(iph->tos);
1242                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1243                 fl4.flowi4_iif = skb->dev->ifindex;
1244                 fl4.flowi4_mark = skb->mark;
1245
1246                 rcu_read_lock();
1247                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1248                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1249                 else
1250                         src = inet_select_addr(rt->dst.dev,
1251                                                rt_nexthop(rt, iph->daddr),
1252                                                RT_SCOPE_UNIVERSE);
1253                 rcu_read_unlock();
1254         }
1255         memcpy(addr, &src, 4);
1256 }
1257
1258 #ifdef CONFIG_IP_ROUTE_CLASSID
1259 static void set_class_tag(struct rtable *rt, u32 tag)
1260 {
1261         if (!(rt->dst.tclassid & 0xFFFF))
1262                 rt->dst.tclassid |= tag & 0xFFFF;
1263         if (!(rt->dst.tclassid & 0xFFFF0000))
1264                 rt->dst.tclassid |= tag & 0xFFFF0000;
1265 }
1266 #endif
1267
1268 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1269 {
1270         unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1271         unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1272                                     ip_rt_min_advmss);
1273
1274         return min(advmss, IPV4_MAX_PMTU - header_size);
1275 }
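
/* Plugging in numbers (illustration only): a route with a 1500 byte MTU
 * advertises an MSS of 1500 - 40 = 1460; tiny MTUs are floored at
 * ip_rt_min_advmss (256) and the result is capped at IPV4_MAX_PMTU - 40 =
 * 65495 so that MSS plus headers never exceeds the 65535 byte IPv4 limit.
 */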
1276
1277 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1278 {
1279         const struct rtable *rt = (const struct rtable *) dst;
1280         unsigned int mtu = rt->rt_pmtu;
1281
1282         if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1283                 mtu = dst_metric_raw(dst, RTAX_MTU);
1284
1285         if (mtu)
1286                 return mtu;
1287
1288         mtu = READ_ONCE(dst->dev->mtu);
1289
1290         if (unlikely(ip_mtu_locked(dst))) {
1291                 if (rt->rt_uses_gateway && mtu > 576)
1292                         mtu = 576;
1293         }
1294
1295         mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1296
1297         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1298 }
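
/* Note on the 576 clamp above: when a route's MTU is locked (ip_mtu_locked())
 * and the route goes via a gateway, the device MTU is not trusted beyond
 * 576 bytes, the datagram size every IPv4 host is required to accept, a
 * conservative choice for paths where PMTU discovery cannot be relied upon.
 */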
1299
1300 static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1301 {
1302         struct fnhe_hash_bucket *hash;
1303         struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1304         u32 hval = fnhe_hashfun(daddr);
1305
1306         spin_lock_bh(&fnhe_lock);
1307
1308         hash = rcu_dereference_protected(nh->nh_exceptions,
1309                                          lockdep_is_held(&fnhe_lock));
1310         hash += hval;
1311
1312         fnhe_p = &hash->chain;
1313         fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1314         while (fnhe) {
1315                 if (fnhe->fnhe_daddr == daddr) {
1316                         rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1317                                 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1318                         fnhe_flush_routes(fnhe);
1319                         kfree_rcu(fnhe, rcu);
1320                         break;
1321                 }
1322                 fnhe_p = &fnhe->fnhe_next;
1323                 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1324                                                  lockdep_is_held(&fnhe_lock));
1325         }
1326
1327         spin_unlock_bh(&fnhe_lock);
1328 }
1329
1330 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1331 {
1332         struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1333         struct fib_nh_exception *fnhe;
1334         u32 hval;
1335
1336         if (!hash)
1337                 return NULL;
1338
1339         hval = fnhe_hashfun(daddr);
1340
1341         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1342              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1343                 if (fnhe->fnhe_daddr == daddr) {
1344                         if (fnhe->fnhe_expires &&
1345                             time_after(jiffies, fnhe->fnhe_expires)) {
1346                                 ip_del_fnhe(nh, daddr);
1347                                 break;
1348                         }
1349                         return fnhe;
1350                 }
1351         }
1352         return NULL;
1353 }
1354
1355 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1356                               __be32 daddr, const bool do_cache)
1357 {
1358         bool ret = false;
1359
1360         spin_lock_bh(&fnhe_lock);
1361
1362         if (daddr == fnhe->fnhe_daddr) {
1363                 struct rtable __rcu **porig;
1364                 struct rtable *orig;
1365                 int genid = fnhe_genid(dev_net(rt->dst.dev));
1366
1367                 if (rt_is_input_route(rt))
1368                         porig = &fnhe->fnhe_rth_input;
1369                 else
1370                         porig = &fnhe->fnhe_rth_output;
1371                 orig = rcu_dereference(*porig);
1372
1373                 if (fnhe->fnhe_genid != genid) {
1374                         fnhe->fnhe_genid = genid;
1375                         fnhe->fnhe_gw = 0;
1376                         fnhe->fnhe_pmtu = 0;
1377                         fnhe->fnhe_expires = 0;
1378                         fnhe->fnhe_mtu_locked = false;
1379                         fnhe_flush_routes(fnhe);
1380                         orig = NULL;
1381                 }
1382                 fill_route_from_fnhe(rt, fnhe);
1383                 if (!rt->rt_gateway)
1384                         rt->rt_gateway = daddr;
1385
1386                 if (do_cache) {
1387                         dst_hold(&rt->dst);
1388                         rcu_assign_pointer(*porig, rt);
1389                         if (orig) {
1390                                 dst_dev_put(&orig->dst);
1391                                 dst_release(&orig->dst);
1392                         }
1393                         ret = true;
1394                 }
1395
1396                 fnhe->fnhe_stamp = jiffies;
1397         }
1398         spin_unlock_bh(&fnhe_lock);
1399
1400         return ret;
1401 }
1402
1403 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1404 {
1405         struct rtable *orig, *prev, **p;
1406         bool ret = true;
1407
1408         if (rt_is_input_route(rt)) {
1409                 p = (struct rtable **)&nh->nh_rth_input;
1410         } else {
1411                 p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1412         }
1413         orig = *p;
1414
1415         /* hold dst before doing cmpxchg() to avoid race condition
1416          * on this dst
1417          */
1418         dst_hold(&rt->dst);
1419         prev = cmpxchg(p, orig, rt);
1420         if (prev == orig) {
1421                 if (orig) {
1422                         dst_dev_put(&orig->dst);
1423                         dst_release(&orig->dst);
1424                 }
1425         } else {
1426                 dst_release(&rt->dst);
1427                 ret = false;
1428         }
1429
1430         return ret;
1431 }
1432
1433 struct uncached_list {
1434         spinlock_t              lock;
1435         struct list_head        head;
1436 };
1437
1438 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1439
1440 void rt_add_uncached_list(struct rtable *rt)
1441 {
1442         struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1443
1444         rt->rt_uncached_list = ul;
1445
1446         spin_lock_bh(&ul->lock);
1447         list_add_tail(&rt->rt_uncached, &ul->head);
1448         spin_unlock_bh(&ul->lock);
1449 }
1450
1451 void rt_del_uncached_list(struct rtable *rt)
1452 {
1453         if (!list_empty(&rt->rt_uncached)) {
1454                 struct uncached_list *ul = rt->rt_uncached_list;
1455
1456                 spin_lock_bh(&ul->lock);
1457                 list_del(&rt->rt_uncached);
1458                 spin_unlock_bh(&ul->lock);
1459         }
1460 }
1461
1462 static void ipv4_dst_destroy(struct dst_entry *dst)
1463 {
1464         struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
1465         struct rtable *rt = (struct rtable *)dst;
1466
1467         if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
1468                 kfree(p);
1469
1470         rt_del_uncached_list(rt);
1471 }
1472
1473 void rt_flush_dev(struct net_device *dev)
1474 {
1475         struct net *net = dev_net(dev);
1476         struct rtable *rt;
1477         int cpu;
1478
1479         for_each_possible_cpu(cpu) {
1480                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1481
1482                 spin_lock_bh(&ul->lock);
1483                 list_for_each_entry(rt, &ul->head, rt_uncached) {
1484                         if (rt->dst.dev != dev)
1485                                 continue;
1486                         rt->dst.dev = net->loopback_dev;
1487                         dev_hold(rt->dst.dev);
1488                         dev_put(dev);
1489                 }
1490                 spin_unlock_bh(&ul->lock);
1491         }
1492 }
1493
1494 static bool rt_cache_valid(const struct rtable *rt)
1495 {
1496         return  rt &&
1497                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1498                 !rt_is_expired(rt);
1499 }
1500
1501 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1502                            const struct fib_result *res,
1503                            struct fib_nh_exception *fnhe,
1504                            struct fib_info *fi, u16 type, u32 itag,
1505                            const bool do_cache)
1506 {
1507         bool cached = false;
1508
1509         if (fi) {
1510                 struct fib_nh *nh = &FIB_RES_NH(*res);
1511
1512                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1513                         rt->rt_gateway = nh->nh_gw;
1514                         rt->rt_uses_gateway = 1;
1515                 }
1516                 dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
1517                 if (fi->fib_metrics != &dst_default_metrics) {
1518                         rt->dst._metrics |= DST_METRICS_REFCOUNTED;
1519                         refcount_inc(&fi->fib_metrics->refcnt);
1520                 }
1521 #ifdef CONFIG_IP_ROUTE_CLASSID
1522                 rt->dst.tclassid = nh->nh_tclassid;
1523 #endif
1524                 rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1525                 if (unlikely(fnhe))
1526                         cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1527                 else if (do_cache)
1528                         cached = rt_cache_route(nh, rt);
1529                 if (unlikely(!cached)) {
1530                         /* Routes we intend to cache in nexthop exception or
1531                          * FIB nexthop have the DST_NOCACHE bit clear.
1532                          * However, if we are unsuccessful at storing this
1533                          * route into the cache we really need to set it.
1534                          */
1535                         if (!rt->rt_gateway)
1536                                 rt->rt_gateway = daddr;
1537                         rt_add_uncached_list(rt);
1538                 }
1539         } else
1540                 rt_add_uncached_list(rt);
1541
1542 #ifdef CONFIG_IP_ROUTE_CLASSID
1543 #ifdef CONFIG_IP_MULTIPLE_TABLES
1544         set_class_tag(rt, res->tclassid);
1545 #endif
1546         set_class_tag(rt, itag);
1547 #endif
1548 }
1549
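/* rt_dst_alloc() is a thin wrapper around dst_alloc() for IPv4 routes: the
 * boolean arguments are translated into DST_HOST/DST_NOPOLICY/DST_NOXFRM
 * flags (DST_HOST only when the route will not be cached), the route is
 * stamped with the current generation id and its per-route fields are
 * reset; callers then override dst.input/dst.output, the gateway fields,
 * etc. as appropriate for the path being built.
 */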
1550 struct rtable *rt_dst_alloc(struct net_device *dev,
1551                             unsigned int flags, u16 type,
1552                             bool nopolicy, bool noxfrm, bool will_cache)
1553 {
1554         struct rtable *rt;
1555
1556         rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1557                        (will_cache ? 0 : DST_HOST) |
1558                        (nopolicy ? DST_NOPOLICY : 0) |
1559                        (noxfrm ? DST_NOXFRM : 0));
1560
1561         if (rt) {
1562                 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1563                 rt->rt_flags = flags;
1564                 rt->rt_type = type;
1565                 rt->rt_is_input = 0;
1566                 rt->rt_iif = 0;
1567                 rt->rt_pmtu = 0;
1568                 rt->rt_mtu_locked = 0;
1569                 rt->rt_gateway = 0;
1570                 rt->rt_uses_gateway = 0;
1571                 INIT_LIST_HEAD(&rt->rt_uncached);
1572
1573                 rt->dst.output = ip_output;
1574                 if (flags & RTCF_LOCAL)
1575                         rt->dst.input = ip_local_deliver;
1576         }
1577
1578         return rt;
1579 }
1580 EXPORT_SYMBOL(rt_dst_alloc);
1581
1582 /* called in rcu_read_lock() section */
1583 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1584                           u8 tos, struct net_device *dev,
1585                           struct in_device *in_dev, u32 *itag)
1586 {
1587         int err;
1588
1589         /* Primary sanity checks. */
1590         if (!in_dev)
1591                 return -EINVAL;
1592
1593         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1594             skb->protocol != htons(ETH_P_IP))
1595                 return -EINVAL;
1596
1597         if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1598                 return -EINVAL;
1599
1600         if (ipv4_is_zeronet(saddr)) {
1601                 if (!ipv4_is_local_multicast(daddr))
1602                         return -EINVAL;
1603         } else {
1604                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1605                                           in_dev, itag);
1606                 if (err < 0)
1607                         return err;
1608         }
1609         return 0;
1610 }
1611
1612 /* called in rcu_read_lock() section */
1613 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1614                              u8 tos, struct net_device *dev, int our)
1615 {
1616         struct in_device *in_dev = __in_dev_get_rcu(dev);
1617         unsigned int flags = RTCF_MULTICAST;
1618         struct rtable *rth;
1619         u32 itag = 0;
1620         int err;
1621
1622         err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1623         if (err)
1624                 return err;
1625
1626         if (our)
1627                 flags |= RTCF_LOCAL;
1628
1629         rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1630                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1631         if (!rth)
1632                 return -ENOBUFS;
1633
1634 #ifdef CONFIG_IP_ROUTE_CLASSID
1635         rth->dst.tclassid = itag;
1636 #endif
1637         rth->dst.output = ip_rt_bug;
1638         rth->rt_is_input = 1;
1639
1640 #ifdef CONFIG_IP_MROUTE
1641         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1642                 rth->dst.input = ip_mr_input;
1643 #endif
1644         RT_CACHE_STAT_INC(in_slow_mc);
1645
1646         skb_dst_set(skb, &rth->dst);
1647         return 0;
1648 }
1649
1650
1651 static void ip_handle_martian_source(struct net_device *dev,
1652                                      struct in_device *in_dev,
1653                                      struct sk_buff *skb,
1654                                      __be32 daddr,
1655                                      __be32 saddr)
1656 {
1657         RT_CACHE_STAT_INC(in_martian_src);
1658 #ifdef CONFIG_IP_ROUTE_VERBOSE
1659         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1660                 /*
1661                  *      RFC 1812 recommendation: if the source is martian,
1662                  *      the only hint is the MAC header.
1663                  */
1664                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1665                         &daddr, &saddr, dev->name);
1666                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1667                         print_hex_dump(KERN_WARNING, "ll header: ",
1668                                        DUMP_PREFIX_OFFSET, 16, 1,
1669                                        skb_mac_header(skb),
1670                                        dev->hard_header_len, true);
1671                 }
1672         }
1673 #endif
1674 }
1675
1676 /* called in rcu_read_lock() section */
1677 static int __mkroute_input(struct sk_buff *skb,
1678                            const struct fib_result *res,
1679                            struct in_device *in_dev,
1680                            __be32 daddr, __be32 saddr, u32 tos)
1681 {
1682         struct fib_nh_exception *fnhe;
1683         struct rtable *rth;
1684         int err;
1685         struct in_device *out_dev;
1686         bool do_cache;
1687         u32 itag = 0;
1688
1689         /* get a working reference to the output device */
1690         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1691         if (!out_dev) {
1692                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1693                 return -EINVAL;
1694         }
1695
1696         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1697                                   in_dev->dev, in_dev, &itag);
1698         if (err < 0) {
1699                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1700                                          saddr);
1701
1702                 goto cleanup;
1703         }
1704
1705         do_cache = res->fi && !itag;
1706         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1707             skb->protocol == htons(ETH_P_IP) &&
1708             (IN_DEV_SHARED_MEDIA(out_dev) ||
1709              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1710                 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1711
1712         if (skb->protocol != htons(ETH_P_IP)) {
1713                 /* Not IP (i.e. ARP). Do not create a route if it is
1714                  * invalid for proxy arp. DNAT routes are always valid.
1715                  *
1716                  * The proxy arp feature has been extended to allow ARP
1717                  * replies back on the same interface, to support
1718                  * Private VLAN switch technologies. See arp.c.
1719                  */
1720                 if (out_dev == in_dev &&
1721                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1722                         err = -EINVAL;
1723                         goto cleanup;
1724                 }
1725         }
1726
1727         fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1728         if (do_cache) {
1729                 if (fnhe)
1730                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1731                 else
1732                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1733                 if (rt_cache_valid(rth)) {
1734                         skb_dst_set_noref(skb, &rth->dst);
1735                         goto out;
1736                 }
1737         }
1738
1739         rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1740                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1741                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1742         if (!rth) {
1743                 err = -ENOBUFS;
1744                 goto cleanup;
1745         }
1746
1747         rth->rt_is_input = 1;
1748         RT_CACHE_STAT_INC(in_slow_tot);
1749
1750         rth->dst.input = ip_forward;
1751
1752         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1753                        do_cache);
1754         lwtunnel_set_redirect(&rth->dst);
1755         skb_dst_set(skb, &rth->dst);
1756 out:
1757         err = 0;
1758  cleanup:
1759         return err;
1760 }
1761
1762 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1763 /* To make ICMP packets follow the right flow, the multipath hash is
1764  * calculated from the inner IP addresses.
1765  */
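/* For example (illustrative only): a fragmentation-needed or time-exceeded
 * error generated somewhere along the path embeds the header of the packet
 * that triggered it.  Hashing on that embedded saddr/daddr spreads the
 * error across ECMP nexthops the same way as the triggering flow itself,
 * instead of hashing on the outer (reporting router, original sender)
 * addresses, which would usually select an unrelated path.
 */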
1766 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1767                                  struct flow_keys *hash_keys)
1768 {
1769         const struct iphdr *outer_iph = ip_hdr(skb);
1770         const struct iphdr *key_iph = outer_iph;
1771         const struct iphdr *inner_iph;
1772         const struct icmphdr *icmph;
1773         struct iphdr _inner_iph;
1774         struct icmphdr _icmph;
1775
1776         if (likely(outer_iph->protocol != IPPROTO_ICMP))
1777                 goto out;
1778
1779         if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1780                 goto out;
1781
1782         icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1783                                    &_icmph);
1784         if (!icmph)
1785                 goto out;
1786
1787         if (icmph->type != ICMP_DEST_UNREACH &&
1788             icmph->type != ICMP_REDIRECT &&
1789             icmph->type != ICMP_TIME_EXCEEDED &&
1790             icmph->type != ICMP_PARAMETERPROB)
1791                 goto out;
1792
1793         inner_iph = skb_header_pointer(skb,
1794                                        outer_iph->ihl * 4 + sizeof(_icmph),
1795                                        sizeof(_inner_iph), &_inner_iph);
1796         if (!inner_iph)
1797                 goto out;
1798
1799         key_iph = inner_iph;
1800 out:
1801         hash_keys->addrs.v4addrs.src = key_iph->saddr;
1802         hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1803 }
1804
1805 /* if skb is set it will be used and fl4 can be NULL */
1806 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1807                        const struct sk_buff *skb, struct flow_keys *flkeys)
1808 {
1809         struct flow_keys hash_keys;
1810         u32 mhash;
1811
1812         switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1813         case 0:
1814                 memset(&hash_keys, 0, sizeof(hash_keys));
1815                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1816                 if (skb) {
1817                         ip_multipath_l3_keys(skb, &hash_keys);
1818                 } else {
1819                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1820                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1821                 }
1822                 break;
1823         case 1:
1824                 /* skb is currently provided only when forwarding */
1825                 if (skb) {
1826                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1827                         struct flow_keys keys;
1828
1829                         /* short-circuit if we already have L4 hash present */
1830                         if (skb->l4_hash)
1831                                 return skb_get_hash_raw(skb) >> 1;
1832
1833                         memset(&hash_keys, 0, sizeof(hash_keys));
1834
1835                         if (!flkeys) {
1836                                 skb_flow_dissect_flow_keys(skb, &keys, flag);
1837                                 flkeys = &keys;
1838                         }
1839
1840                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1841                         hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1842                         hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1843                         hash_keys.ports.src = flkeys->ports.src;
1844                         hash_keys.ports.dst = flkeys->ports.dst;
1845                         hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1846                 } else {
1847                         memset(&hash_keys, 0, sizeof(hash_keys));
1848                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1849                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1850                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1851                         hash_keys.ports.src = fl4->fl4_sport;
1852                         hash_keys.ports.dst = fl4->fl4_dport;
1853                         hash_keys.basic.ip_proto = fl4->flowi4_proto;
1854                 }
1855                 break;
1856         }
1857         mhash = flow_hash_from_keys(&hash_keys);
1858
1859         return mhash >> 1;
1860 }
1861 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
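/* The 31-bit hash computed above is consumed by fib_select_multipath():
 * ip_mkroute_input() below is the input-path caller, and the output path
 * typically reaches the same helper through fib_select_path().
 */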
1862
1863 static int ip_mkroute_input(struct sk_buff *skb,
1864                             struct fib_result *res,
1865                             struct in_device *in_dev,
1866                             __be32 daddr, __be32 saddr, u32 tos,
1867                             struct flow_keys *hkeys)
1868 {
1869 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1870         if (res->fi && res->fi->fib_nhs > 1) {
1871                 int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
1872
1873                 fib_select_multipath(res, h);
1874         }
1875 #endif
1876
1877         /* create a routing cache entry */
1878         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1879 }
1880
1881 /*
1882  *      NOTE. We drop all packets that have local source
1883  *      addresses, because every properly looped-back packet
1884  *      must already have the correct destination attached by the output routine.
1885  *
1886  *      This approach solves two big problems:
1887  *      1. Non-simplex devices are handled properly.
1888  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1889  *      Called with rcu_read_lock().
1890  */
1891
1892 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1893                                u8 tos, struct net_device *dev,
1894                                struct fib_result *res)
1895 {
1896         struct in_device *in_dev = __in_dev_get_rcu(dev);
1897         struct flow_keys *flkeys = NULL, _flkeys;
1898         struct net    *net = dev_net(dev);
1899         struct ip_tunnel_info *tun_info;
1900         int             err = -EINVAL;
1901         unsigned int    flags = 0;
1902         u32             itag = 0;
1903         struct rtable   *rth;
1904         struct flowi4   fl4;
1905         bool do_cache;
1906
1907         /* IP on this device is disabled. */
1908
1909         if (!in_dev)
1910                 goto out;
1911
1912         /* Check for the most weird martians, which cannot be detected
1913            by fib_lookup.
1914          */
1915
1916         tun_info = skb_tunnel_info(skb);
1917         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1918                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1919         else
1920                 fl4.flowi4_tun_key.tun_id = 0;
1921         skb_dst_drop(skb);
1922
1923         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1924                 goto martian_source;
1925
1926         res->fi = NULL;
1927         res->table = NULL;
1928         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1929                 goto brd_input;
1930
1931         /* Accept zero addresses only for limited broadcast;
1932          * I do not even know whether to fix it or not. Waiting for complaints :-)
1933          */
1934         if (ipv4_is_zeronet(saddr))
1935                 goto martian_source;
1936
1937         if (ipv4_is_zeronet(daddr))
1938                 goto martian_destination;
1939
1940         /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1941          * calling it at most once and only when daddr and/or saddr are loopback addresses.
1942          */
1943         if (ipv4_is_loopback(daddr)) {
1944                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1945                         goto martian_destination;
1946         } else if (ipv4_is_loopback(saddr)) {
1947                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1948                         goto martian_source;
1949         }
1950
1951         /*
1952          *      Now we are ready to route packet.
1953          */
1954         fl4.flowi4_oif = 0;
1955         fl4.flowi4_iif = dev->ifindex;
1956         fl4.flowi4_mark = skb->mark;
1957         fl4.flowi4_tos = tos;
1958         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1959         fl4.flowi4_flags = 0;
1960         fl4.daddr = daddr;
1961         fl4.saddr = saddr;
1962         fl4.flowi4_uid = sock_net_uid(net, NULL);
1963
1964         if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
1965                 flkeys = &_flkeys;
1966         } else {
1967                 fl4.flowi4_proto = 0;
1968                 fl4.fl4_sport = 0;
1969                 fl4.fl4_dport = 0;
1970         }
1971
1972         err = fib_lookup(net, &fl4, res, 0);
1973         if (err != 0) {
1974                 if (!IN_DEV_FORWARD(in_dev))
1975                         err = -EHOSTUNREACH;
1976                 goto no_route;
1977         }
1978
1979         if (res->type == RTN_BROADCAST)
1980                 goto brd_input;
1981
1982         if (res->type == RTN_LOCAL) {
1983                 err = fib_validate_source(skb, saddr, daddr, tos,
1984                                           0, dev, in_dev, &itag);
1985                 if (err < 0)
1986                         goto martian_source;
1987                 goto local_input;
1988         }
1989
1990         if (!IN_DEV_FORWARD(in_dev)) {
1991                 err = -EHOSTUNREACH;
1992                 goto no_route;
1993         }
1994         if (res->type != RTN_UNICAST)
1995                 goto martian_destination;
1996
1997         err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
1998 out:    return err;
1999
2000 brd_input:
2001         if (skb->protocol != htons(ETH_P_IP))
2002                 goto e_inval;
2003
2004         if (!ipv4_is_zeronet(saddr)) {
2005                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2006                                           in_dev, &itag);
2007                 if (err < 0)
2008                         goto martian_source;
2009         }
2010         flags |= RTCF_BROADCAST;
2011         res->type = RTN_BROADCAST;
2012         RT_CACHE_STAT_INC(in_brd);
2013
2014 local_input:
2015         do_cache = false;
2016         if (res->fi) {
2017                 if (!itag) {
2018                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
2019                         if (rt_cache_valid(rth)) {
2020                                 skb_dst_set_noref(skb, &rth->dst);
2021                                 err = 0;
2022                                 goto out;
2023                         }
2024                         do_cache = true;
2025                 }
2026         }
2027
2028         rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2029                            flags | RTCF_LOCAL, res->type,
2030                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2031         if (!rth)
2032                 goto e_nobufs;
2033
2034         rth->dst.output = ip_rt_bug;
2035 #ifdef CONFIG_IP_ROUTE_CLASSID
2036         rth->dst.tclassid = itag;
2037 #endif
2038         rth->rt_is_input = 1;
2039
2040         RT_CACHE_STAT_INC(in_slow_tot);
2041         if (res->type == RTN_UNREACHABLE) {
2042                 rth->dst.input = ip_error;
2043                 rth->dst.error = -err;
2044                 rth->rt_flags   &= ~RTCF_LOCAL;
2045         }
2046
2047         if (do_cache) {
2048                 struct fib_nh *nh = &FIB_RES_NH(*res);
2049
2050                 rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2051                 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2052                         WARN_ON(rth->dst.input == lwtunnel_input);
2053                         rth->dst.lwtstate->orig_input = rth->dst.input;
2054                         rth->dst.input = lwtunnel_input;
2055                 }
2056
2057                 if (unlikely(!rt_cache_route(nh, rth)))
2058                         rt_add_uncached_list(rth);
2059         }
2060         skb_dst_set(skb, &rth->dst);
2061         err = 0;
2062         goto out;
2063
2064 no_route:
2065         RT_CACHE_STAT_INC(in_no_route);
2066         res->type = RTN_UNREACHABLE;
2067         res->fi = NULL;
2068         res->table = NULL;
2069         goto local_input;
2070
2071         /*
2072          *      Do not cache martian addresses: they should be logged (RFC1812)
2073          */
2074 martian_destination:
2075         RT_CACHE_STAT_INC(in_martian_dst);
2076 #ifdef CONFIG_IP_ROUTE_VERBOSE
2077         if (IN_DEV_LOG_MARTIANS(in_dev))
2078                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2079                                      &daddr, &saddr, dev->name);
2080 #endif
2081
2082 e_inval:
2083         err = -EINVAL;
2084         goto out;
2085
2086 e_nobufs:
2087         err = -ENOBUFS;
2088         goto out;
2089
2090 martian_source:
2091         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2092         goto out;
2093 }
2094
2095 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2096                          u8 tos, struct net_device *dev)
2097 {
2098         struct fib_result res;
2099         int err;
2100
2101         tos &= IPTOS_RT_MASK;
2102         rcu_read_lock();
2103         err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2104         rcu_read_unlock();
2105
2106         return err;
2107 }
2108 EXPORT_SYMBOL(ip_route_input_noref);
2109
2110 /* called with rcu_read_lock held */
2111 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2112                        u8 tos, struct net_device *dev, struct fib_result *res)
2113 {
2114         /* Multicast recognition logic was moved from the route cache to here.
2115            The problem was that too many Ethernet cards have broken/missing
2116            hardware multicast filters :-( As a result, a host on a multicasting
2117            network acquires a lot of useless route cache entries, a sort of
2118            SDR messages from all over the world. Now we try to get rid of them.
2119            Really, provided the software IP multicast filter is organized
2120            reasonably (at least, hashed), it does not result in a slowdown
2121            compared with route cache reject entries.
2122            Note that multicast routers are not affected, because a
2123            route cache entry is created eventually.
2124          */
2125         if (ipv4_is_multicast(daddr)) {
2126                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2127                 int our = 0;
2128                 int err = -EINVAL;
2129
2130                 if (in_dev)
2131                         our = ip_check_mc_rcu(in_dev, daddr, saddr,
2132                                               ip_hdr(skb)->protocol);
2133
2134                 /* check l3 master if no match yet */
2135                 if ((!in_dev || !our) && netif_is_l3_slave(dev)) {
2136                         struct in_device *l3_in_dev;
2137
2138                         l3_in_dev = __in_dev_get_rcu(skb->dev);
2139                         if (l3_in_dev)
2140                                 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2141                                                       ip_hdr(skb)->protocol);
2142                 }
2143
2144                 if (our
2145 #ifdef CONFIG_IP_MROUTE
2146                         ||
2147                     (!ipv4_is_local_multicast(daddr) &&
2148                      IN_DEV_MFORWARD(in_dev))
2149 #endif
2150                    ) {
2151                         err = ip_route_input_mc(skb, daddr, saddr,
2152                                                 tos, dev, our);
2153                 }
2154                 return err;
2155         }
2156
2157         return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2158 }
2159
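/* __mkroute_output() first tries to reuse a cached output route: either the
 * one attached to a matching nexthop exception (fnhe_rth_output) or the
 * per-cpu one hanging off the FIB nexthop (nh_pcpu_rth_output).  Only when
 * no valid cached entry exists, or caching is undesirable for this flow,
 * does it fall through to the "add:" label and allocate a fresh rtable.
 */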
2160 /* called with rcu_read_lock() */
2161 static struct rtable *__mkroute_output(const struct fib_result *res,
2162                                        const struct flowi4 *fl4, int orig_oif,
2163                                        struct net_device *dev_out,
2164                                        unsigned int flags)
2165 {
2166         struct fib_info *fi = res->fi;
2167         struct fib_nh_exception *fnhe;
2168         struct in_device *in_dev;
2169         u16 type = res->type;
2170         struct rtable *rth;
2171         bool do_cache;
2172
2173         in_dev = __in_dev_get_rcu(dev_out);
2174         if (!in_dev)
2175                 return ERR_PTR(-EINVAL);
2176
2177         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2178                 if (ipv4_is_loopback(fl4->saddr) &&
2179                     !(dev_out->flags & IFF_LOOPBACK) &&
2180                     !netif_is_l3_master(dev_out))
2181                         return ERR_PTR(-EINVAL);
2182
2183         if (ipv4_is_lbcast(fl4->daddr))
2184                 type = RTN_BROADCAST;
2185         else if (ipv4_is_multicast(fl4->daddr))
2186                 type = RTN_MULTICAST;
2187         else if (ipv4_is_zeronet(fl4->daddr))
2188                 return ERR_PTR(-EINVAL);
2189
2190         if (dev_out->flags & IFF_LOOPBACK)
2191                 flags |= RTCF_LOCAL;
2192
2193         do_cache = true;
2194         if (type == RTN_BROADCAST) {
2195                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2196                 fi = NULL;
2197         } else if (type == RTN_MULTICAST) {
2198                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2199                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2200                                      fl4->flowi4_proto))
2201                         flags &= ~RTCF_LOCAL;
2202                 else
2203                         do_cache = false;
2204                 /* If a multicast route does not exist, use the
2205                  * default one, but do not gateway in this case.
2206                  * Yes, it is a hack.
2207                  */
2208                 if (fi && res->prefixlen < 4)
2209                         fi = NULL;
2210         } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2211                    (orig_oif != dev_out->ifindex)) {
2212                 /* For local routes that require a particular output interface
2213                  * we do not want to cache the result.  Caching the result
2214                  * causes incorrect behaviour when there are multiple source
2215                  * addresses on the interface, the end result being that if the
2216                  * intended recipient is waiting on that interface for the
2217                  * packet he won't receive it because it will be delivered on
2218                  * the loopback interface and the IP_PKTINFO ipi_ifindex will
2219                  * be set to the loopback interface as well.
2220                  */
2221                 do_cache = false;
2222         }
2223
2224         fnhe = NULL;
2225         do_cache &= fi != NULL;
2226         if (fi) {
2227                 struct rtable __rcu **prth;
2228                 struct fib_nh *nh = &FIB_RES_NH(*res);
2229
2230                 fnhe = find_exception(nh, fl4->daddr);
2231                 if (!do_cache)
2232                         goto add;
2233                 if (fnhe) {
2234                         prth = &fnhe->fnhe_rth_output;
2235                 } else {
2236                         if (unlikely(fl4->flowi4_flags &
2237                                      FLOWI_FLAG_KNOWN_NH &&
2238                                      !(nh->nh_gw &&
2239                                        nh->nh_scope == RT_SCOPE_LINK))) {
2240                                 do_cache = false;
2241                                 goto add;
2242                         }
2243                         prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2244                 }
2245                 rth = rcu_dereference(*prth);
2246                 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2247                         return rth;
2248         }
2249
2250 add:
2251         rth = rt_dst_alloc(dev_out, flags, type,
2252                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2253                            IN_DEV_CONF_GET(in_dev, NOXFRM),
2254                            do_cache);
2255         if (!rth)
2256                 return ERR_PTR(-ENOBUFS);
2257
2258         rth->rt_iif = orig_oif;
2259
2260         RT_CACHE_STAT_INC(out_slow_tot);
2261
2262         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2263                 if (flags & RTCF_LOCAL &&
2264                     !(dev_out->flags & IFF_LOOPBACK)) {
2265                         rth->dst.output = ip_mc_output;
2266                         RT_CACHE_STAT_INC(out_slow_mc);
2267                 }
2268 #ifdef CONFIG_IP_MROUTE
2269                 if (type == RTN_MULTICAST) {
2270                         if (IN_DEV_MFORWARD(in_dev) &&
2271                             !ipv4_is_local_multicast(fl4->daddr)) {
2272                                 rth->dst.input = ip_mr_input;
2273                                 rth->dst.output = ip_mc_output;
2274                         }
2275                 }
2276 #endif
2277         }
2278
2279         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2280         lwtunnel_set_redirect(&rth->dst);
2281
2282         return rth;
2283 }
2284
2285 /*
2286  * Major route resolver routine.
2287  */
2288
2289 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2290                                         const struct sk_buff *skb)
2291 {
2292         __u8 tos = RT_FL_TOS(fl4);
2293         struct fib_result res = {
2294                 .type           = RTN_UNSPEC,
2295                 .fi             = NULL,
2296                 .table          = NULL,
2297                 .tclassid       = 0,
2298         };
2299         struct rtable *rth;
2300
2301         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2302         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2303         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2304                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2305
2306         rcu_read_lock();
2307         rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2308         rcu_read_unlock();
2309
2310         return rth;
2311 }
2312 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2313
2314 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2315                                             struct fib_result *res,
2316                                             const struct sk_buff *skb)
2317 {
2318         struct net_device *dev_out = NULL;
2319         int orig_oif = fl4->flowi4_oif;
2320         unsigned int flags = 0;
2321         struct rtable *rth;
2322         int err = -ENETUNREACH;
2323
2324         if (fl4->saddr) {
2325                 rth = ERR_PTR(-EINVAL);
2326                 if (ipv4_is_multicast(fl4->saddr) ||
2327                     ipv4_is_lbcast(fl4->saddr) ||
2328                     ipv4_is_zeronet(fl4->saddr))
2329                         goto out;
2330
2331                 /* I removed the check for oif == dev_out->oif here.
2332                    It was wrong for two reasons:
2333                    1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2334                       is assigned to multiple interfaces.
2335                    2. Moreover, we are allowed to send packets with the saddr
2336                       of another iface. --ANK
2337                  */
2338
2339                 if (fl4->flowi4_oif == 0 &&
2340                     (ipv4_is_multicast(fl4->daddr) ||
2341                      ipv4_is_lbcast(fl4->daddr))) {
2342                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2343                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2344                         if (!dev_out)
2345                                 goto out;
2346
2347                         /* Special hack: the user can direct multicasts
2348                            and limited broadcast via the necessary interface
2349                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2350                            This hack is not just for fun, it allows
2351                            vic, vat and friends to work.
2352                            They bind a socket to loopback, set the ttl to zero
2353                            and expect that it will work.
2354                            From the viewpoint of the routing cache they are broken,
2355                            because we are not allowed to build a multicast path
2356                            with a loopback source addr (look, the routing cache
2357                            cannot know that the ttl is zero, so the packet
2358                            will not leave this host and the route is valid).
2359                            Luckily, this hack is a good workaround.
2360                          */
2361
2362                         fl4->flowi4_oif = dev_out->ifindex;
2363                         goto make_route;
2364                 }
2365
2366                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2367                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2368                         if (!__ip_dev_find(net, fl4->saddr, false))
2369                                 goto out;
2370                 }
2371         }
2372
2373
2374         if (fl4->flowi4_oif) {
2375                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2376                 rth = ERR_PTR(-ENODEV);
2377                 if (!dev_out)
2378                         goto out;
2379
2380                 /* RACE: Check return value of inet_select_addr instead. */
2381                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2382                         rth = ERR_PTR(-ENETUNREACH);
2383                         goto out;
2384                 }
2385                 if (ipv4_is_local_multicast(fl4->daddr) ||
2386                     ipv4_is_lbcast(fl4->daddr) ||
2387                     fl4->flowi4_proto == IPPROTO_IGMP) {
2388                         if (!fl4->saddr)
2389                                 fl4->saddr = inet_select_addr(dev_out, 0,
2390                                                               RT_SCOPE_LINK);
2391                         goto make_route;
2392                 }
2393                 if (!fl4->saddr) {
2394                         if (ipv4_is_multicast(fl4->daddr))
2395                                 fl4->saddr = inet_select_addr(dev_out, 0,
2396                                                               fl4->flowi4_scope);
2397                         else if (!fl4->daddr)
2398                                 fl4->saddr = inet_select_addr(dev_out, 0,
2399                                                               RT_SCOPE_HOST);
2400                 }
2401         }
2402
2403         if (!fl4->daddr) {
2404                 fl4->daddr = fl4->saddr;
2405                 if (!fl4->daddr)
2406                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2407                 dev_out = net->loopback_dev;
2408                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2409                 res->type = RTN_LOCAL;
2410                 flags |= RTCF_LOCAL;
2411                 goto make_route;
2412         }
2413
2414         err = fib_lookup(net, fl4, res, 0);
2415         if (err) {
2416                 res->fi = NULL;
2417                 res->table = NULL;
2418                 if (fl4->flowi4_oif &&
2419                     (ipv4_is_multicast(fl4->daddr) ||
2420                     !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2421                         /* Apparently, the routing tables are wrong. Assume
2422                            that the destination is on-link.
2423
2424                            WHY? DW.
2425                            Because we are allowed to send to an iface
2426                            even if it has NO routes and NO assigned
2427                            addresses. When oif is specified, routing
2428                            tables are looked up with only one purpose:
2429                            to catch whether the destination is gatewayed rather than
2430                            direct. Moreover, if MSG_DONTROUTE is set,
2431                            we send the packet, ignoring both routing tables
2432                            and ifaddr state. --ANK
2433
2434
2435                            We could do this even if oif is unknown
2436                            (likely IPv6), but we do not.
2437                          */
2438
2439                         if (fl4->saddr == 0)
2440                                 fl4->saddr = inet_select_addr(dev_out, 0,
2441                                                               RT_SCOPE_LINK);
2442                         res->type = RTN_UNICAST;
2443                         goto make_route;
2444                 }
2445                 rth = ERR_PTR(err);
2446                 goto out;
2447         }
2448
2449         if (res->type == RTN_LOCAL) {
2450                 if (!fl4->saddr) {
2451                         if (res->fi->fib_prefsrc)
2452                                 fl4->saddr = res->fi->fib_prefsrc;
2453                         else
2454                                 fl4->saddr = fl4->daddr;
2455                 }
2456
2457                 /* L3 master device is the loopback for that domain */
2458                 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2459                         net->loopback_dev;
2460
2461                 /* make sure orig_oif points to fib result device even
2462                  * though packet rx/tx happens over loopback or l3mdev
2463                  */
2464                 orig_oif = FIB_RES_OIF(*res);
2465
2466                 fl4->flowi4_oif = dev_out->ifindex;
2467                 flags |= RTCF_LOCAL;
2468                 goto make_route;
2469         }
2470
2471         fib_select_path(net, res, fl4, skb);
2472
2473         dev_out = FIB_RES_DEV(*res);
2474         fl4->flowi4_oif = dev_out->ifindex;
2475
2476
2477 make_route:
2478         rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2479
2480 out:
2481         return rth;
2482 }
2483
2484 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2485 {
2486         return NULL;
2487 }
2488
2489 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2490 {
2491         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2492
2493         return mtu ? : dst->dev->mtu;
2494 }
2495
2496 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2497                                           struct sk_buff *skb, u32 mtu)
2498 {
2499 }
2500
2501 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2502                                        struct sk_buff *skb)
2503 {
2504 }
2505
2506 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2507                                           unsigned long old)
2508 {
2509         return NULL;
2510 }
2511
2512 static struct dst_ops ipv4_dst_blackhole_ops = {
2513         .family                 =       AF_INET,
2514         .check                  =       ipv4_blackhole_dst_check,
2515         .mtu                    =       ipv4_blackhole_mtu,
2516         .default_advmss         =       ipv4_default_advmss,
2517         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2518         .redirect               =       ipv4_rt_blackhole_redirect,
2519         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2520         .neigh_lookup           =       ipv4_neigh_lookup,
2521 };
2522
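/* ipv4_blackhole_route() copies the routing decision from @dst_orig into a
 * dst backed by ipv4_dst_blackhole_ops, whose input/output handlers simply
 * discard packets, whose check/cow_metrics hooks return NULL and whose
 * update_pmtu/redirect hooks do nothing.  It is meant for callers
 * (e.g. xfrm) that need to hold a flow harmlessly, for instance while
 * IPsec state is still being resolved.
 */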
2523 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2524 {
2525         struct rtable *ort = (struct rtable *) dst_orig;
2526         struct rtable *rt;
2527
2528         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2529         if (rt) {
2530                 struct dst_entry *new = &rt->dst;
2531
2532                 new->__use = 1;
2533                 new->input = dst_discard;
2534                 new->output = dst_discard_out;
2535
2536                 new->dev = net->loopback_dev;
2537                 if (new->dev)
2538                         dev_hold(new->dev);
2539
2540                 rt->rt_is_input = ort->rt_is_input;
2541                 rt->rt_iif = ort->rt_iif;
2542                 rt->rt_pmtu = ort->rt_pmtu;
2543                 rt->rt_mtu_locked = ort->rt_mtu_locked;
2544
2545                 rt->rt_genid = rt_genid_ipv4(net);
2546                 rt->rt_flags = ort->rt_flags;
2547                 rt->rt_type = ort->rt_type;
2548                 rt->rt_gateway = ort->rt_gateway;
2549                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2550
2551                 INIT_LIST_HEAD(&rt->rt_uncached);
2552         }
2553
2554         dst_release(dst_orig);
2555
2556         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2557 }
2558
2559 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2560                                     const struct sock *sk)
2561 {
2562         struct rtable *rt = __ip_route_output_key(net, flp4);
2563
2564         if (IS_ERR(rt))
2565                 return rt;
2566
2567         if (flp4->flowi4_proto)
2568                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2569                                                         flowi4_to_flowi(flp4),
2570                                                         sk, 0);
2571
2572         return rt;
2573 }
2574 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2575
2576 /* called with rcu_read_lock held */
2577 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
2578                         struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2579                         u32 seq)
2580 {
2581         struct rtable *rt = skb_rtable(skb);
2582         struct rtmsg *r;
2583         struct nlmsghdr *nlh;
2584         unsigned long expires = 0;
2585         u32 error;
2586         u32 metrics[RTAX_MAX];
2587
2588         nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
2589         if (!nlh)
2590                 return -EMSGSIZE;
2591
2592         r = nlmsg_data(nlh);
2593         r->rtm_family    = AF_INET;
2594         r->rtm_dst_len  = 32;
2595         r->rtm_src_len  = 0;
2596         r->rtm_tos      = fl4->flowi4_tos;
2597         r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2598         if (nla_put_u32(skb, RTA_TABLE, table_id))
2599                 goto nla_put_failure;
2600         r->rtm_type     = rt->rt_type;
2601         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2602         r->rtm_protocol = RTPROT_UNSPEC;
2603         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2604         if (rt->rt_flags & RTCF_NOTIFY)
2605                 r->rtm_flags |= RTM_F_NOTIFY;
2606         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2607                 r->rtm_flags |= RTCF_DOREDIRECT;
2608
2609         if (nla_put_in_addr(skb, RTA_DST, dst))
2610                 goto nla_put_failure;
2611         if (src) {
2612                 r->rtm_src_len = 32;
2613                 if (nla_put_in_addr(skb, RTA_SRC, src))
2614                         goto nla_put_failure;
2615         }
2616         if (rt->dst.dev &&
2617             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2618                 goto nla_put_failure;
2619 #ifdef CONFIG_IP_ROUTE_CLASSID
2620         if (rt->dst.tclassid &&
2621             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2622                 goto nla_put_failure;
2623 #endif
2624         if (!rt_is_input_route(rt) &&
2625             fl4->saddr != src) {
2626                 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2627                         goto nla_put_failure;
2628         }
2629         if (rt->rt_uses_gateway &&
2630             nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2631                 goto nla_put_failure;
2632
2633         expires = rt->dst.expires;
2634         if (expires) {
2635                 unsigned long now = jiffies;
2636
2637                 if (time_before(now, expires))
2638                         expires -= now;
2639                 else
2640                         expires = 0;
2641         }
2642
2643         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2644         if (rt->rt_pmtu && expires)
2645                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2646         if (rt->rt_mtu_locked && expires)
2647                 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2648         if (rtnetlink_put_metrics(skb, metrics) < 0)
2649                 goto nla_put_failure;
2650
2651         if (fl4->flowi4_mark &&
2652             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2653                 goto nla_put_failure;
2654
2655         if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2656             nla_put_u32(skb, RTA_UID,
2657                         from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2658                 goto nla_put_failure;
2659
2660         error = rt->dst.error;
2661
2662         if (rt_is_input_route(rt)) {
2663 #ifdef CONFIG_IP_MROUTE
2664                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2665                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2666                         int err = ipmr_get_route(net, skb,
2667                                                  fl4->saddr, fl4->daddr,
2668                                                  r, portid);
2669
2670                         if (err <= 0) {
2671                                 if (err == 0)
2672                                         return 0;
2673                                 goto nla_put_failure;
2674                         }
2675                 } else
2676 #endif
2677                         if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2678                                 goto nla_put_failure;
2679         }
2680
2681         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2682                 goto nla_put_failure;
2683
2684         nlmsg_end(skb, nlh);
2685         return 0;
2686
2687 nla_put_failure:
2688         nlmsg_cancel(skb, nlh);
2689         return -EMSGSIZE;
2690 }
2691
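/* inet_rtm_getroute() services RTM_GETROUTE requests (what "ip route get"
 * sends): it fabricates a dummy skb with just enough of an IP header,
 * performs an input-side lookup when RTA_IIF is supplied or an output-side
 * lookup otherwise, and replies with either fib_dump_info() (for
 * RTM_F_FIB_MATCH) or rt_fill_info() above.
 */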
2692 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2693                              struct netlink_ext_ack *extack)
2694 {
2695         struct net *net = sock_net(in_skb->sk);
2696         struct rtmsg *rtm;
2697         struct nlattr *tb[RTA_MAX+1];
2698         struct fib_result res = {};
2699         struct rtable *rt = NULL;
2700         struct flowi4 fl4;
2701         __be32 dst = 0;
2702         __be32 src = 0;
2703         u32 iif;
2704         int err;
2705         int mark;
2706         struct sk_buff *skb;
2707         u32 table_id = RT_TABLE_MAIN;
2708         kuid_t uid;
2709
2710         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
2711                           extack);
2712         if (err < 0)
2713                 goto errout;
2714
2715         rtm = nlmsg_data(nlh);
2716
2717         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2718         if (!skb) {
2719                 err = -ENOBUFS;
2720                 goto errout;
2721         }
2722
2723         /* Reserve room for dummy headers; this skb can pass
2724            through a good chunk of the routing engine.
2725          */
2726         skb_reset_mac_header(skb);
2727         skb_reset_network_header(skb);
2728
2729         src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2730         dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2731         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2732         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2733         if (tb[RTA_UID])
2734                 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2735         else
2736                 uid = (iif ? INVALID_UID : current_uid());
2737
2738         /* Bugfix: need to give ip_route_input enough of an IP header to
2739          * not gag.
2740          */
2741         ip_hdr(skb)->protocol = IPPROTO_UDP;
2742         ip_hdr(skb)->saddr = src;
2743         ip_hdr(skb)->daddr = dst;
2744
2745         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2746
2747         memset(&fl4, 0, sizeof(fl4));
2748         fl4.daddr = dst;
2749         fl4.saddr = src;
2750         fl4.flowi4_tos = rtm->rtm_tos;
2751         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2752         fl4.flowi4_mark = mark;
2753         fl4.flowi4_uid = uid;
2754
2755         rcu_read_lock();
2756
2757         if (iif) {
2758                 struct net_device *dev;
2759
2760                 dev = dev_get_by_index_rcu(net, iif);
2761                 if (!dev) {
2762                         err = -ENODEV;
2763                         goto errout_free;
2764                 }
2765
2766                 skb->protocol   = htons(ETH_P_IP);
2767                 skb->dev        = dev;
2768                 skb->mark       = mark;
2769                 err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
2770                                          dev, &res);
2771
2772                 rt = skb_rtable(skb);
2773                 if (err == 0 && rt->dst.error)
2774                         err = -rt->dst.error;
2775         } else {
2776                 fl4.flowi4_iif = LOOPBACK_IFINDEX;
2777                 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
2778                 err = 0;
2779                 if (IS_ERR(rt))
2780                         err = PTR_ERR(rt);
2781                 else
2782                         skb_dst_set(skb, &rt->dst);
2783         }
2784
2785         if (err)
2786                 goto errout_free;
2787
2788         if (rtm->rtm_flags & RTM_F_NOTIFY)
2789                 rt->rt_flags |= RTCF_NOTIFY;
2790
2791         if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2792                 table_id = res.table ? res.table->tb_id : 0;
2793
2794         if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
2795                 if (!res.fi) {
2796                         err = fib_props[res.type].error;
2797                         if (!err)
2798                                 err = -EHOSTUNREACH;
2799                         goto errout_free;
2800                 }
2801                 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
2802                                     nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
2803                                     rt->rt_type, res.prefix, res.prefixlen,
2804                                     fl4.flowi4_tos, res.fi, 0);
2805         } else {
2806                 err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2807                                    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
2808         }
2809         if (err < 0)
2810                 goto errout_free;
2811
2812         rcu_read_unlock();
2813
2814         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2815 errout:
2816         return err;
2817
2818 errout_free:
2819         rcu_read_unlock();
2820         kfree_skb(skb);
2821         goto errout;
2822 }
2823
2824 void ip_rt_multicast_event(struct in_device *in_dev)
2825 {
2826         rt_cache_flush(dev_net(in_dev->dev));
2827 }
2828
2829 #ifdef CONFIG_SYSCTL
2830 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2831 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2832 static int ip_rt_gc_elasticity __read_mostly    = 8;
2833 static int ip_min_valid_pmtu __read_mostly      = IPV4_MIN_MTU;
2834
2835 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2836                                         void __user *buffer,
2837                                         size_t *lenp, loff_t *ppos)
2838 {
2839         struct net *net = (struct net *)__ctl->extra1;
2840
2841         if (write) {
2842                 rt_cache_flush(net);
2843                 fnhe_genid_bump(net);
2844                 return 0;
2845         }
2846
2847         return -EINVAL;
2848 }
2849
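/* The two tables below back the files under /proc/sys/net/ipv4/route/:
 * ipv4_route_table holds the global tunables, while ipv4_route_flush_table
 * provides the per-namespace, write-only "flush" entry registered by
 * sysctl_route_net_init() further down.
 */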
static struct ctl_table ipv4_route_table[] = {
        {
                .procname       = "gc_thresh",
                .data           = &ipv4_dst_ops.gc_thresh,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "max_size",
                .data           = &ip_rt_max_size,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                /* Deprecated. Use gc_min_interval_ms */
                .procname       = "gc_min_interval",
                .data           = &ip_rt_gc_min_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "gc_min_interval_ms",
                .data           = &ip_rt_gc_min_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_ms_jiffies,
        },
        {
                .procname       = "gc_timeout",
                .data           = &ip_rt_gc_timeout,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "gc_interval",
                .data           = &ip_rt_gc_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "redirect_load",
                .data           = &ip_rt_redirect_load,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "redirect_number",
                .data           = &ip_rt_redirect_number,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "redirect_silence",
                .data           = &ip_rt_redirect_silence,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "error_cost",
                .data           = &ip_rt_error_cost,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "error_burst",
                .data           = &ip_rt_error_burst,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "gc_elasticity",
                .data           = &ip_rt_gc_elasticity,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "mtu_expires",
                .data           = &ip_rt_mtu_expires,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "min_pmtu",
                .data           = &ip_rt_min_pmtu,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_minmax,
                .extra1         = &ip_min_valid_pmtu,
        },
        {
                .procname       = "min_adv_mss",
                .data           = &ip_rt_min_advmss,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        { }
};

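/*
 * Per-netns "flush" entry: writing any value to
 * /proc/sys/net/ipv4/route/flush (e.g. "echo 1 > .../flush") invokes
 * ipv4_sysctl_rtcache_flush(), defined earlier in this file, to
 * invalidate cached routing state for the owning namespace.  The file
 * is write-only (mode 0200).
 */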
static struct ctl_table ipv4_route_flush_table[] = {
        {
                .procname       = "flush",
                .maxlen         = sizeof(int),
                .mode           = 0200,
                .proc_handler   = ipv4_sysctl_rtcache_flush,
        },
        { },
};

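/*
 * Register the flush table for a network namespace.  Namespaces other
 * than init_net get their own kmemdup()ed copy of the template;
 * .extra1 carries the owning netns so the flush handler knows which
 * namespace to flush, and the entry is hidden from namespaces created
 * by unprivileged users.
 */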
static __net_init int sysctl_route_net_init(struct net *net)
{
        struct ctl_table *tbl;

        tbl = ipv4_route_flush_table;
        if (!net_eq(net, &init_net)) {
                tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
                if (!tbl)
                        goto err_dup;

                /* Don't export sysctls to unprivileged users */
                if (net->user_ns != &init_user_ns)
                        tbl[0].procname = NULL;
        }
        tbl[0].extra1 = net;

        net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
        if (!net->ipv4.route_hdr)
                goto err_reg;
        return 0;

err_reg:
        if (tbl != ipv4_route_flush_table)
                kfree(tbl);
err_dup:
        return -ENOMEM;
}

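/*
 * Undo sysctl_route_net_init(): unregister the per-netns header and
 * free the duplicated table.  The BUG_ON() guards against ever freeing
 * the static template used by init_net.
 */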
static __net_exit void sysctl_route_net_exit(struct net *net)
{
        struct ctl_table *tbl;

        tbl = net->ipv4.route_hdr->ctl_table_arg;
        unregister_net_sysctl_table(net->ipv4.route_hdr);
        BUG_ON(tbl == ipv4_route_flush_table);
        kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
        .init = sysctl_route_net_init,
        .exit = sysctl_route_net_exit,
};
#endif

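/*
 * Seed the per-netns generation counters.  rt_genid and fnhe_genid are
 * bumped elsewhere to invalidate cached routes and next-hop exceptions;
 * dev_addr_genid starts at a random value.
 */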
static __net_init int rt_genid_init(struct net *net)
{
        atomic_set(&net->ipv4.rt_genid, 0);
        atomic_set(&net->fnhe_genid, 0);
        atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
        return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
        .init = rt_genid_init,
};

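/*
 * Per-netns inet_peer base: allocated on namespace creation and torn
 * down (tree invalidated, then freed) on namespace exit.  It holds
 * long-lived per-destination state such as ICMP rate-limiting data.
 */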
static int __net_init ipv4_inetpeer_init(struct net *net)
{
        struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

        if (!bp)
                return -ENOMEM;
        inet_peer_base_init(bp);
        net->ipv4.peers = bp;
        return 0;
}

static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
        struct inet_peer_base *bp = net->ipv4.peers;

        net->ipv4.peers = NULL;
        inetpeer_invalidate_tree(bp);
        kfree(bp);
}

static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
        .init   =       ipv4_inetpeer_init,
        .exit   =       ipv4_inetpeer_exit,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */

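/*
 * One-time boot initialisation for the IPv4 routing code: IP ID
 * generator state, per-CPU uncached route lists, the dst slab cache and
 * entry counters, /proc files, XFRM hooks, the RTM_GETROUTE handler and
 * the per-netns operations registered above.
 */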
int __init ip_rt_init(void)
{
        int cpu;

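        /*
         * ip_idents/ip_tstamps back IP identification field generation
         * (see ip_idents_reserve()/__ip_select_ident() earlier in this
         * file); seed the ident array with random data.
         */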
        ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
        if (!ip_idents)
                panic("IP: failed to allocate ip_idents\n");

        prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));

        ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
        if (!ip_tstamps)
                panic("IP: failed to allocate ip_tstamps\n");

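        /*
         * Each CPU keeps a list of "uncached" routes so they can be
         * found and flushed when their device goes away.
         */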
        for_each_possible_cpu(cpu) {
                struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

                INIT_LIST_HEAD(&ul->head);
                spin_lock_init(&ul->lock);
        }
#ifdef CONFIG_IP_ROUTE_CLASSID
        ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct),
                                    __alignof__(struct ip_rt_acct));
        if (!ip_rt_acct)
                panic("IP: failed to allocate ip_rt_acct\n");
#endif

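        /*
         * Slab cache for struct rtable, shared with the blackhole dst
         * ops; dst_entries_init() sets up the per-CPU dst entry
         * counters.  With the old routing cache gone, gc_thresh and
         * max_size are set so that they never trigger.
         */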
        ipv4_dst_ops.kmem_cachep =
                kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
                                  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

        ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

        if (dst_entries_init(&ipv4_dst_ops) < 0)
                panic("IP: failed to allocate ipv4_dst_ops counter\n");

        if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
                panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

        ipv4_dst_ops.gc_thresh = ~0;
        ip_rt_max_size = INT_MAX;

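        /*
         * Bring up the remaining pieces: device/address and FIB
         * initialisation, /proc entries, optional XFRM hooks, the
         * netlink RTM_GETROUTE handler and the pernet operations
         * defined above.
         */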
        devinet_init();
        ip_fib_init();

        if (ip_rt_proc_init())
                pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
        xfrm_init();
        xfrm4_init();
#endif
        rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
                      RTNL_FLAG_DOIT_UNLOCKED);

#ifdef CONFIG_SYSCTL
        register_pernet_subsys(&sysctl_route_ops);
#endif
        register_pernet_subsys(&rt_genid_ops);
        register_pernet_subsys(&ipv4_inetpeer_ops);
        return 0;
}

#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
        register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif