// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD;
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 */
#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/nexthop.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>

#include "fib_lookup.h"
#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static u32 ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;

static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static void		 ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	WARN_ON(1);
	return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr);
static void		 ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
	.family			= AF_INET,
	.check			= ipv4_dst_check,
	.default_advmss		= ipv4_default_advmss,
	.mtu			= ipv4_mtu,
	.cow_metrics		= ipv4_cow_metrics,
	.destroy		= ipv4_dst_destroy,
	.negative_advice	= ipv4_negative_advice,
	.link_failure		= ipv4_link_failure,
	.update_pmtu		= ip_rt_update_pmtu,
	.redirect		= ip_do_redirect,
	.local_out		= __ip_local_out,
	.neigh_lookup		= ipv4_neigh_lookup,
	.confirm_neigh		= ipv4_confirm_neigh,
};

#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
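/* Editor's sketch (not part of the original file): callers map the four
 * IPTOS bits to an skb priority by indexing the table above, as the
 * rt_tos2priority() helper in <net/route.h> does:
 *
 *	static inline char rt_tos2priority(u8 tos)
 *	{
 *		return ip_tos2prio[IPTOS_TOS(tos) >> 1];
 *	}
 */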
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (*pos)
		return NULL;
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   0, /* st->in_hit */
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,
		   0, /* st->out_hit */
		   st->out_slow_tot,
		   st->out_slow_mc,
		   0, /* st->gc_total */
		   0, /* st->gc_ignored */
		   0, /* st->gc_goal_miss */
		   0, /* st->gc_dst_overflow */
		   0, /* st->in_hlist_search */
		   0  /* st->out_hlist_search */
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};

static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}
#endif
static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_create("rt_cache", 0444, net->proc_net,
			  &rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", 0444,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create_single("rt_acct", 0, net->proc_net,
				 rt_acct_proc_show);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =	{
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */
static inline bool rt_is_expired(const struct rtable *rth)
{
	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
	rt_genid_bump_ipv4(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr)
{
	const struct rtable *rt = container_of(dst, struct rtable, dst);
	struct net_device *dev = dst->dev;
	struct neighbour *n;

	rcu_read_lock_bh();

	if (likely(rt->rt_gw_family == AF_INET)) {
		n = ip_neigh_gw4(dev, rt->rt_gw4);
	} else if (rt->rt_gw_family == AF_INET6) {
		n = ip_neigh_gw6(dev, &rt->rt_gw6);
	} else {
		__be32 pkey;

		pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
		n = ip_neigh_gw4(dev, pkey);
	}

	if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
		n = NULL;

	rcu_read_unlock_bh();

	return n;
}
static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	const struct rtable *rt = container_of(dst, struct rtable, dst);
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;

	if (rt->rt_gw_family == AF_INET) {
		pkey = (const __be32 *)&rt->rt_gw4;
	} else if (rt->rt_gw_family == AF_INET6) {
		return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
	} else if (!daddr ||
		   (rt->rt_flags &
		    (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
		return;
	}
	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}
#define IP_IDENTS_SZ 2048u

static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
	u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
	atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
	u32 old = READ_ONCE(*p_tstamp);
	u32 now = (u32)jiffies;
	u32 new, delta = 0;

	if (old != now && cmpxchg(p_tstamp, old, now) == old)
		delta = prandom_u32_max(now - old);

	/* Do not use atomic_add_return() as it makes UBSAN unhappy */
	do {
		old = (u32)atomic_read(p_id);
		new = old + delta + segs;
	} while (atomic_cmpxchg(p_id, old, new) != old);

	return new;
}
EXPORT_SYMBOL(ip_idents_reserve);

void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
	u32 hash, id;

	/* Note the following code is not safe, but this is okay. */
	if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
		get_random_bytes(&net->ipv4.ip_id_key,
				 sizeof(net->ipv4.ip_id_key));

	hash = siphash_3u32((__force u32)iph->daddr,
			    (__force u32)iph->saddr,
			    iph->protocol,
			    &net->ipv4.ip_id_key);
	id = ip_idents_reserve(hash, segs);
	iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);
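/* Usage sketch (editor's assumption, mirroring the ip_select_ident()
 * wrappers in <net/ip.h>): a sender typically reserves an ID only when
 * the packet may be fragmented, roughly:
 *
 *	struct iphdr *iph = ip_hdr(skb);
 *
 *	if (!(iph->frag_off & htons(IP_DF)))
 *		__ip_select_ident(net, iph, segs);
 */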
static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
			     const struct sock *sk,
			     const struct iphdr *iph,
			     int oif, u8 tos,
			     u8 prot, u32 mark, int flow_flags)
{
	if (sk) {
		const struct inet_sock *inet = inet_sk(sk);

		oif = sk->sk_bound_dev_if;
		mark = sk->sk_mark;
		tos = RT_CONN_FLAGS(sk);
		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
	}
	flowi4_init_output(fl4, oif, mark, tos,
			   RT_SCOPE_UNIVERSE, prot,
			   flow_flags,
			   iph->daddr, iph->saddr, 0, 0,
			   sock_net_uid(net, sk));
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
			       const struct sock *sk)
{
	const struct net *net = dev_net(skb->dev);
	const struct iphdr *iph = ip_hdr(skb);
	int oif = skb->dev->ifindex;
	u8 tos = RT_TOS(iph->tos);
	u8 prot = iph->protocol;
	u32 mark = skb->mark;

	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	const struct ip_options_rcu *inet_opt;
	__be32 daddr = inet->inet_daddr;

	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	if (inet_opt && inet_opt->opt.srr)
		daddr = inet_opt->opt.faddr;
	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
			   inet_sk_flowi_flags(sk),
			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
	rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (skb)
		build_skb_flow_key(fl4, skb, sk);
	else
		build_sk_flow_key(fl4, sk);
}
static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
	struct rtable *rt;

	rt = rcu_dereference(fnhe->fnhe_rth_input);
	if (rt) {
		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
		dst_dev_put(&rt->dst);
		dst_release(&rt->dst);
	}
	rt = rcu_dereference(fnhe->fnhe_rth_output);
	if (rt) {
		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
		dst_dev_put(&rt->dst);
		dst_release(&rt->dst);
	}
}

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
	struct fib_nh_exception *fnhe, *oldest;

	oldest = rcu_dereference(hash->chain);
	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
			oldest = fnhe;
	}
	fnhe_flush_routes(oldest);
	return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
	static u32 fnhe_hashrnd __read_mostly;
	u32 hval;

	net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
	hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
	return hash_32(hval, FNHE_HASH_SHIFT);
}
static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
	rt->rt_pmtu = fnhe->fnhe_pmtu;
	rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
	rt->dst.expires = fnhe->fnhe_expires;

	if (fnhe->fnhe_gw) {
		rt->rt_flags |= RTCF_REDIRECTED;
		rt->rt_uses_gateway = 1;
		rt->rt_gw_family = AF_INET;
		rt->rt_gw4 = fnhe->fnhe_gw;
	}
}

static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
				  __be32 gw, u32 pmtu, bool lock,
				  unsigned long expires)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe;
	struct rtable *rt;
	u32 genid, hval;
	unsigned int i;
	int depth;

	genid = fnhe_genid(dev_net(nhc->nhc_dev));
	hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = rcu_dereference(nhc->nhc_exceptions);
	if (!hash) {
		hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
		if (!hash)
			goto out_unlock;
		rcu_assign_pointer(nhc->nhc_exceptions, hash);
	}

	hash += hval;

	depth = 0;
	for (fnhe = rcu_dereference(hash->chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr)
			break;
		depth++;
	}

	if (fnhe) {
		if (fnhe->fnhe_genid != genid)
			fnhe->fnhe_genid = genid;
		if (gw)
			fnhe->fnhe_gw = gw;
		if (pmtu) {
			fnhe->fnhe_pmtu = pmtu;
			fnhe->fnhe_mtu_locked = lock;
		}
		fnhe->fnhe_expires = max(1UL, expires);
		/* Update all cached dsts too */
		rt = rcu_dereference(fnhe->fnhe_rth_input);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
		rt = rcu_dereference(fnhe->fnhe_rth_output);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
	} else {
		if (depth > FNHE_RECLAIM_DEPTH)
			fnhe = fnhe_oldest(hash);
		else {
			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
			if (!fnhe)
				goto out_unlock;

			fnhe->fnhe_next = hash->chain;
			rcu_assign_pointer(hash->chain, fnhe);
		}
		fnhe->fnhe_genid = genid;
		fnhe->fnhe_daddr = daddr;
		fnhe->fnhe_gw = gw;
		fnhe->fnhe_pmtu = pmtu;
		fnhe->fnhe_mtu_locked = lock;
		fnhe->fnhe_expires = max(1UL, expires);

		/* Exception created; mark the cached routes for the nexthop
		 * stale, so anyone caching it rechecks if this exception
		 * applies to them.
		 */
		rt = rcu_dereference(nhc->nhc_rth_input);
		if (rt)
			rt->dst.obsolete = DST_OBSOLETE_KILL;

		for_each_possible_cpu(i) {
			struct rtable __rcu **prt;

			prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
			rt = rcu_dereference(*prt);
			if (rt)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
		}
	}

	fnhe->fnhe_stamp = jiffies;

out_unlock:
	spin_unlock_bh(&fnhe_lock);
}
static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
			     bool kill_route)
{
	__be32 new_gw = icmp_hdr(skb)->un.gateway;
	__be32 old_gw = ip_hdr(skb)->saddr;
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct fib_result res;
	struct neighbour *n;
	struct net *net;

	switch (icmp_hdr(skb)->code & 7) {
	case ICMP_REDIR_NET:
	case ICMP_REDIR_NETTOS:
	case ICMP_REDIR_HOST:
	case ICMP_REDIR_HOSTTOS:
		break;

	default:
		return;
	}

	if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
		return;

	in_dev = __in_dev_get_rcu(dev);
	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
	if (!n)
		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
	if (!IS_ERR(n)) {
		if (!(n->nud_state & NUD_VALID)) {
			neigh_event_send(n, NULL);
		} else {
			if (fib_lookup(net, fl4, &res, 0) == 0) {
				struct fib_nh_common *nhc = FIB_RES_NHC(res);

				update_or_create_fnhe(nhc, fl4->daddr, new_gw,
						0, false,
						jiffies + ip_rt_gc_timeout);
			}
			if (kill_route)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
		}
		neigh_release(n);
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev)) {
		const struct iphdr *iph = (const struct iphdr *) skb->data;
		__be32 daddr = iph->daddr;
		__be32 saddr = iph->saddr;

		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
				     "  Advised path = %pI4 -> %pI4\n",
				     &old_gw, dev->name, &new_gw,
				     &saddr, &daddr);
	}
#endif
	;
}
static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct rtable *rt;
	struct flowi4 fl4;
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct net *net = dev_net(skb->dev);
	int oif = skb->dev->ifindex;
	u8 tos = RT_TOS(iph->tos);
	u8 prot = iph->protocol;
	u32 mark = skb->mark;

	rt = (struct rtable *) dst;

	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
	__ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   rt->dst.expires) {
			ip_rt_put(rt);
			ret = NULL;
		}
	}
	return ret;
}
/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	struct net *net;
	int log_martians;
	int vif;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
	rcu_read_unlock();

	net = dev_net(rt->dst.dev);
	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
	if (!peer) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
			  rt_nexthop(rt, ip_hdr(skb)->daddr));
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
		peer->rate_tokens = 0;
		peer->n_redirects = 0;
	}

	/* Too many ignored redirects; do not send anything, just
	 * set dst.rate_last to the last seen redirected packet.
	 */
	if (peer->n_redirects >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		goto out_put_peer;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
		++peer->n_redirects;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number)
			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
					     &ip_hdr(skb)->saddr, inet_iif(skb),
					     &ip_hdr(skb)->daddr, &gw);
#endif
	}
out_put_peer:
	inet_putpeer(peer);
}
static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct inet_peer *peer;
	unsigned long now;
	struct net *net;
	bool send;
	int code;

	if (netif_is_l3_master(skb->dev)) {
		dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
		if (!dev)
			goto out;
	}

	in_dev = __in_dev_get_rcu(dev);

	/* IP on this device is disabled. */
	if (!in_dev)
		goto out;

	net = dev_net(rt->dst.dev);
	if (!IN_DEV_FORWARD(in_dev)) {
		switch (rt->dst.error) {
		case EHOSTUNREACH:
			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
			break;

		case ENETUNREACH:
			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
			break;
		}
		goto out;
	}

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
			       l3mdev_master_ifindex(skb->dev), 1);

	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
		inet_putpeer(peer);
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}
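/* Editor's note on the token bucket above (with the defaults): tokens
 * accumulate at one per jiffy up to ip_rt_error_burst (5 * HZ), and each
 * ICMP error costs ip_rt_error_cost (HZ) tokens, so a sustained stream
 * of bad packets yields at most one ICMP error per second, with bursts
 * of up to five back-to-back errors.
 */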
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
	struct dst_entry *dst = &rt->dst;
	u32 old_mtu = ipv4_mtu(dst);
	struct fib_result res;
	bool lock = false;

	if (ip_mtu_locked(dst))
		return;

	if (old_mtu < mtu)
		return;

	if (mtu < ip_rt_min_pmtu) {
		lock = true;
		mtu = min(old_mtu, ip_rt_min_pmtu);
	}

	if (rt->rt_pmtu == mtu && !lock &&
	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
		return;

	rcu_read_lock();
	if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
		struct fib_nh_common *nhc = FIB_RES_NHC(res);

		update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
				      jiffies + ip_rt_mtu_expires);
	}
	rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			      struct sk_buff *skb, u32 mtu)
{
	struct rtable *rt = (struct rtable *) dst;
	struct flowi4 fl4;

	ip_rt_build_flow_key(&fl4, sk, skb);
	__ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
		      int oif, u8 protocol)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	u32 mark = IP4_REPLY_MARK(net, skb->mark);

	__build_flow_key(net, &fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, mark, 0);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);

	if (!fl4.flowi4_mark)
		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

	rt = __ip_route_output_key(sock_net(sk), &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	struct dst_entry *odst = NULL;
	bool new = false;
	struct net *net = sock_net(sk);

	bh_lock_sock(sk);

	if (!ip_sk_accept_pmtu(sk))
		goto out;

	odst = sk_dst_get(sk);

	if (sock_owned_by_user(sk) || !odst) {
		__ipv4_sk_update_pmtu(skb, sk, mtu);
		goto out;
	}

	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

	rt = (struct rtable *)odst;
	if (odst->obsolete && !odst->ops->check(odst, 0)) {
		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	__ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);

	if (!dst_check(&rt->dst, 0)) {
		if (new)
			dst_release(&rt->dst);

		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	if (new)
		sk_dst_set(sk, &rt->dst);

out:
	bh_unlock_sock(sk);
	dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
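/* Editor's note (assumption based on its callers): this is the entry
 * point used by the per-protocol ICMP error handlers, e.g. tcp_v4_err()
 * calls ipv4_sk_update_pmtu() when an ICMP_FRAG_NEEDED message arrives
 * for an established socket.
 */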
void ipv4_redirect(struct sk_buff *skb, struct net *net,
		   int oif, u8 protocol)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(net, &fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, 0, 0);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	struct net *net = sock_net(sk);

	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rtable *rt = (struct rtable *) dst;

	/* All IPV4 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 *
	 * When a PMTU/redirect information update invalidates a route,
	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
	 * DST_OBSOLETE_DEAD.
	 */
	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
		return NULL;
	return dst;
}
static void ipv4_send_dest_unreach(struct sk_buff *skb)
{
	struct ip_options opt;
	int res;

	/* Recompile ip options since IPCB may not be valid anymore.
	 * Also check we have a reasonable ipv4 header.
	 */
	if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
	    ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
		return;

	memset(&opt, 0, sizeof(opt));
	if (ip_hdr(skb)->ihl > 5) {
		if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
			return;
		opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);

		rcu_read_lock();
		res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
		rcu_read_unlock();

		if (res)
			return;
	}
	__icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
}

static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	ipv4_send_dest_unreach(skb);

	rt = skb_rtable(skb);
	if (rt)
		dst_set_expires(&rt->dst, 0);
}
static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	pr_debug("%s: %pI4 -> %pI4, %s\n",
		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		 skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	WARN_ON(1);
	return 0;
}

/*
 * We do not cache the source address of the outgoing interface,
 * because it is used only by IP RR, TS and SRR options,
 * so it is out of the fast path.
 *
 * BTW remember: "addr" is allowed to be not aligned
 * in IP options!
 */
void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct iphdr *iph = ip_hdr(skb);
		struct flowi4 fl4 = {
			.daddr = iph->daddr,
			.saddr = iph->saddr,
			.flowi4_tos = RT_TOS(iph->tos),
			.flowi4_oif = rt->dst.dev->ifindex,
			.flowi4_iif = skb->dev->ifindex,
			.flowi4_mark = skb->mark,
		};

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
			src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
		else
			src = inet_select_addr(rt->dst.dev,
					       rt_nexthop(rt, iph->daddr),
					       RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}
#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
	unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
	unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
				    ip_rt_min_advmss);

	return min(advmss, IPV4_MAX_PMTU - header_size);
}

static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
	const struct rtable *rt = (const struct rtable *) dst;
	unsigned int mtu = rt->rt_pmtu;

	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
		mtu = dst_metric_raw(dst, RTAX_MTU);

	if (mtu)
		return mtu;

	mtu = READ_ONCE(dst->dev->mtu);

	if (unlikely(ip_mtu_locked(dst))) {
		if (rt->rt_uses_gateway && mtu > 576)
			mtu = 576;
	}

	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);

	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}
static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
	u32 hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = rcu_dereference_protected(nhc->nhc_exceptions,
					 lockdep_is_held(&fnhe_lock));
	hash += hval;

	fnhe_p = &hash->chain;
	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
	while (fnhe) {
		if (fnhe->fnhe_daddr == daddr) {
			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
			/* set fnhe_daddr to 0 to ensure it won't bind with
			 * new dsts in rt_bind_exception().
			 */
			fnhe->fnhe_daddr = 0;
			fnhe_flush_routes(fnhe);
			kfree_rcu(fnhe, rcu);
			break;
		}
		fnhe_p = &fnhe->fnhe_next;
		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
						 lockdep_is_held(&fnhe_lock));
	}

	spin_unlock_bh(&fnhe_lock);
}
static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
					       __be32 daddr)
{
	struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
	struct fib_nh_exception *fnhe;
	u32 hval;

	if (!hash)
		return NULL;

	hval = fnhe_hashfun(daddr);

	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr) {
			if (fnhe->fnhe_expires &&
			    time_after(jiffies, fnhe->fnhe_expires)) {
				ip_del_fnhe(nhc, daddr);
				break;
			}
			return fnhe;
		}
	}
	return NULL;
}

/* MTU selection:
 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 */
u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
{
	struct fib_nh_common *nhc = res->nhc;
	struct net_device *dev = nhc->nhc_dev;
	struct fib_info *fi = res->fi;
	u32 mtu = 0;

	if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
	    fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
		mtu = fi->fib_mtu;

	if (likely(!mtu)) {
		struct fib_nh_exception *fnhe;

		fnhe = find_exception(nhc, daddr);
		if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
			mtu = fnhe->fnhe_pmtu;
	}

	if (likely(!mtu))
		mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);

	return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
}
static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
			      __be32 daddr, const bool do_cache)
{
	bool ret = false;

	spin_lock_bh(&fnhe_lock);

	if (daddr == fnhe->fnhe_daddr) {
		struct rtable __rcu **porig;
		struct rtable *orig;
		int genid = fnhe_genid(dev_net(rt->dst.dev));

		if (rt_is_input_route(rt))
			porig = &fnhe->fnhe_rth_input;
		else
			porig = &fnhe->fnhe_rth_output;
		orig = rcu_dereference(*porig);

		if (fnhe->fnhe_genid != genid) {
			fnhe->fnhe_genid = genid;
			fnhe->fnhe_gw = 0;
			fnhe->fnhe_pmtu = 0;
			fnhe->fnhe_expires = 0;
			fnhe->fnhe_mtu_locked = false;
			fnhe_flush_routes(fnhe);
			orig = NULL;
		}
		fill_route_from_fnhe(rt, fnhe);
		if (!rt->rt_gw4) {
			rt->rt_gw4 = daddr;
			rt->rt_gw_family = AF_INET;
		}

		if (do_cache) {
			dst_hold(&rt->dst);
			rcu_assign_pointer(*porig, rt);
			if (orig) {
				dst_dev_put(&orig->dst);
				dst_release(&orig->dst);
			}
			ret = true;
		}

		fnhe->fnhe_stamp = jiffies;
	}
	spin_unlock_bh(&fnhe_lock);

	return ret;
}
static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
{
	struct rtable *orig, *prev, **p;
	bool ret = true;

	if (rt_is_input_route(rt)) {
		p = (struct rtable **)&nhc->nhc_rth_input;
	} else {
		p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
	}
	orig = *p;

	/* hold dst before doing cmpxchg() to avoid race condition
	 * on this dst
	 */
	dst_hold(&rt->dst);
	prev = cmpxchg(p, orig, rt);
	if (prev == orig) {
		if (orig) {
			dst_dev_put(&orig->dst);
			dst_release(&orig->dst);
		}
	} else {
		dst_release(&rt->dst);
		ret = false;
	}

	return ret;
}

struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);

void rt_add_uncached_list(struct rtable *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);

	rt->rt_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

void rt_del_uncached_list(struct rtable *rt)
{
	if (!list_empty(&rt->rt_uncached)) {
		struct uncached_list *ul = rt->rt_uncached_list;

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt_uncached);
		spin_unlock_bh(&ul->lock);
	}
}
static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;

	ip_dst_metrics_put(dst);
	rt_del_uncached_list(rt);
}

void rt_flush_dev(struct net_device *dev)
{
	struct rtable *rt;
	int cpu;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt_uncached) {
			if (rt->dst.dev != dev)
				continue;
			rt->dst.dev = blackhole_netdev;
			dev_hold(rt->dst.dev);
			dev_put(dev);
		}
		spin_unlock_bh(&ul->lock);
	}
}

static bool rt_cache_valid(const struct rtable *rt)
{
	return	rt &&
		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
		!rt_is_expired(rt);
}
static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
			   const struct fib_result *res,
			   struct fib_nh_exception *fnhe,
			   struct fib_info *fi, u16 type, u32 itag,
			   const bool do_cache)
{
	bool cached = false;

	if (fi) {
		struct fib_nh_common *nhc = FIB_RES_NHC(*res);

		if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
			rt->rt_uses_gateway = 1;
			rt->rt_gw_family = nhc->nhc_gw_family;
			/* only INET and INET6 are supported */
			if (likely(nhc->nhc_gw_family == AF_INET))
				rt->rt_gw4 = nhc->nhc_gw.ipv4;
			else
				rt->rt_gw6 = nhc->nhc_gw.ipv6;
		}

		ip_dst_init_metrics(&rt->dst, fi->fib_metrics);

#ifdef CONFIG_IP_ROUTE_CLASSID
		if (nhc->nhc_family == AF_INET) {
			struct fib_nh *nh;

			nh = container_of(nhc, struct fib_nh, nh_common);
			rt->dst.tclassid = nh->nh_tclassid;
		}
#endif
		rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
		if (unlikely(fnhe))
			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
		else if (do_cache)
			cached = rt_cache_route(nhc, rt);
		if (unlikely(!cached)) {
			/* Routes we intend to cache in nexthop exception or
			 * FIB nexthop have the DST_NOCACHE bit clear.
			 * However, if we are unsuccessful at storing this
			 * route into the cache we really need to set it.
			 */
			if (!rt->rt_gw4) {
				rt->rt_gw_family = AF_INET;
				rt->rt_gw4 = daddr;
			}
			rt_add_uncached_list(rt);
		}
	} else
		rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, res->tclassid);
#endif
	set_class_tag(rt, itag);
#endif
}
struct rtable *rt_dst_alloc(struct net_device *dev,
			    unsigned int flags, u16 type,
			    bool nopolicy, bool noxfrm, bool will_cache)
{
	struct rtable *rt;

	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
		       (will_cache ? 0 : DST_HOST) |
		       (nopolicy ? DST_NOPOLICY : 0) |
		       (noxfrm ? DST_NOXFRM : 0));

	if (rt) {
		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
		rt->rt_flags = flags;
		rt->rt_type = type;
		rt->rt_is_input = 0;
		rt->rt_iif = 0;
		rt->rt_pmtu = 0;
		rt->rt_mtu_locked = 0;
		rt->rt_uses_gateway = 0;
		rt->rt_gw_family = 0;
		rt->rt_gw4 = 0;
		INIT_LIST_HEAD(&rt->rt_uncached);

		rt->dst.output = ip_output;
		if (flags & RTCF_LOCAL)
			rt->dst.input = ip_local_deliver;
	}

	return rt;
}
EXPORT_SYMBOL(rt_dst_alloc);

struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
{
	struct rtable *new_rt;

	new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
			   rt->dst.flags);

	if (new_rt) {
		new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
		new_rt->rt_flags = rt->rt_flags;
		new_rt->rt_type = rt->rt_type;
		new_rt->rt_is_input = rt->rt_is_input;
		new_rt->rt_iif = rt->rt_iif;
		new_rt->rt_pmtu = rt->rt_pmtu;
		new_rt->rt_mtu_locked = rt->rt_mtu_locked;
		new_rt->rt_gw_family = rt->rt_gw_family;
		if (rt->rt_gw_family == AF_INET)
			new_rt->rt_gw4 = rt->rt_gw4;
		else if (rt->rt_gw_family == AF_INET6)
			new_rt->rt_gw6 = rt->rt_gw6;
		INIT_LIST_HEAD(&new_rt->rt_uncached);

		new_rt->dst.flags |= DST_HOST;
		new_rt->dst.input = rt->dst.input;
		new_rt->dst.output = rt->dst.output;
		new_rt->dst.error = rt->dst.error;
		new_rt->dst.lastuse = jiffies;
		new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
	}
	return new_rt;
}
EXPORT_SYMBOL(rt_dst_clone);
/* called in rcu_read_lock() section */
int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			  u8 tos, struct net_device *dev,
			  struct in_device *in_dev, u32 *itag)
{
	int err;

	/* Primary sanity checks. */
	if (!in_dev)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    skb->protocol != htons(ETH_P_IP))
		return -EINVAL;

	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
		return -EINVAL;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr) &&
		    ip_hdr(skb)->protocol != IPPROTO_IGMP)
			return -EINVAL;
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, itag);
		if (err < 0)
			return err;
	}
	return 0;
}

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	unsigned int flags = RTCF_MULTICAST;
	struct rtable *rth;
	u32 itag = 0;
	int err;

	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
	if (err)
		return err;

	if (our)
		flags |= RTCF_LOCAL;

	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
	if (!rth)
		return -ENOBUFS;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->dst.output = ip_rt_bug;
	rth->rt_is_input = 1;

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	skb_dst_set(skb, &rth->dst);
	return 0;
}
static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation: if the source is martian,
		 *	the only hint is the MAC header.
		 */
		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			print_hex_dump(KERN_WARNING, "ll header: ",
				       DUMP_PREFIX_OFFSET, 16, 1,
				       skb_mac_header(skb),
				       dev->hard_header_len, false);
		}
	}
#endif
}
/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos)
{
	struct fib_nh_common *nhc = FIB_RES_NHC(*res);
	struct net_device *dev = nhc->nhc_dev;
	struct fib_nh_exception *fnhe;
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	bool do_cache;
	u32 itag = 0;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(dev);
	if (!out_dev) {
		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
		return -EINVAL;
	}

	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, in_dev, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	do_cache = res->fi && !itag;
	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
	    skb->protocol == htons(ETH_P_IP)) {
		__be32 gw;

		gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
		if (IN_DEV_SHARED_MEDIA(out_dev) ||
		    inet_addr_onlink(out_dev, saddr, gw))
			IPCB(skb)->flags |= IPSKB_DOREDIRECT;
	}

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create a route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * The proxy arp feature has been extended to allow ARP
		 * replies back on the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	fnhe = find_exception(nhc, daddr);
	if (do_cache) {
		if (fnhe)
			rth = rcu_dereference(fnhe->fnhe_rth_input);
		else
			rth = rcu_dereference(nhc->nhc_rth_input);
		if (rt_cache_valid(rth)) {
			skb_dst_set_noref(skb, &rth->dst);
			goto out;
		}
	}

	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_is_input = 1;
	RT_CACHE_STAT_INC(in_slow_tot);

	rth->dst.input = ip_forward;

	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
		       do_cache);
	lwtunnel_set_redirect(&rth->dst);
	skb_dst_set(skb, &rth->dst);
out:
	err = 0;
cleanup:
	return err;
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
/* To make ICMP packets follow the right flow, the multipath hash is
 * calculated from the inner IP addresses.
 */
static void ip_multipath_l3_keys(const struct sk_buff *skb,
				 struct flow_keys *hash_keys)
{
	const struct iphdr *outer_iph = ip_hdr(skb);
	const struct iphdr *key_iph = outer_iph;
	const struct iphdr *inner_iph;
	const struct icmphdr *icmph;
	struct iphdr _inner_iph;
	struct icmphdr _icmph;

	if (likely(outer_iph->protocol != IPPROTO_ICMP))
		goto out;

	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
		goto out;

	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
				   &_icmph);
	if (!icmph)
		goto out;

	if (icmph->type != ICMP_DEST_UNREACH &&
	    icmph->type != ICMP_REDIRECT &&
	    icmph->type != ICMP_TIME_EXCEEDED &&
	    icmph->type != ICMP_PARAMETERPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       outer_iph->ihl * 4 + sizeof(_icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
out:
	hash_keys->addrs.v4addrs.src = key_iph->saddr;
	hash_keys->addrs.v4addrs.dst = key_iph->daddr;
}
/* if skb is set it will be used and fl4 can be NULL */
int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
	struct flow_keys hash_keys;
	u32 mhash;

	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
		if (skb) {
			ip_multipath_l3_keys(skb, &hash_keys);
		} else {
			hash_keys.addrs.v4addrs.src = fl4->saddr;
			hash_keys.addrs.v4addrs.dst = fl4->daddr;
		}
		break;
	case 1:
		/* skb is currently provided only when forwarding */
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}

			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
			hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
			hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
			hash_keys.addrs.v4addrs.src = fl4->saddr;
			hash_keys.addrs.v4addrs.dst = fl4->daddr;
			hash_keys.ports.src = fl4->fl4_sport;
			hash_keys.ports.dst = fl4->fl4_dport;
			hash_keys.basic.ip_proto = fl4->flowi4_proto;
		}
		break;
	case 2:
		memset(&hash_keys, 0, sizeof(hash_keys));
		/* skb is currently provided only when forwarding */
		if (skb) {
			struct flow_keys keys;

			skb_flow_dissect_flow_keys(skb, &keys, 0);
			/* Inner can be v4 or v6 */
			if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
				hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
				hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
			} else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
				hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
				hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
				hash_keys.tags.flow_label = keys.tags.flow_label;
				hash_keys.basic.ip_proto = keys.basic.ip_proto;
			} else {
				/* Same as case 0 */
				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
				ip_multipath_l3_keys(skb, &hash_keys);
			}
		} else {
			/* Same as case 0 */
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
			hash_keys.addrs.v4addrs.src = fl4->saddr;
			hash_keys.addrs.v4addrs.dst = fl4->daddr;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	if (multipath_hash)
		mhash = jhash_2words(mhash, multipath_hash, 0);

	return mhash >> 1;
}
#endif /* CONFIG_IP_ROUTE_MULTIPATH */
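/* The switch in fib_multipath_hash() above follows the
 * net.ipv4.fib_multipath_hash_policy sysctl:
 *   0 - L3 (source and destination addresses; inner addresses for ICMP)
 *   1 - L4 (addresses, ports and protocol)
 *   2 - L3 of the inner header for encapsulated packets being forwarded
 */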
static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos,
			    struct flow_keys *hkeys)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && fib_info_num_path(res->fi) > 1) {
		int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);

		fib_select_multipath(res, h);
	}
#endif

	/* create a routing cache entry */
	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
}
/*
 * NOTE. We drop all the packets that have local source
 * addresses, because every properly looped back packet
 * must have the correct destination already attached by the output routine.
 *
 * This approach solves two big problems:
 * 1. Non-simplex devices are handled properly.
 * 2. IP spoofing attempts are filtered with a 100% guarantee.
 *
 * called with rcu_read_lock()
 */

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev,
			       struct fib_result *res)
{
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flow_keys *flkeys = NULL, _flkeys;
	struct net *net = dev_net(dev);
	struct ip_tunnel_info *tun_info;
	int err = -EINVAL;
	unsigned int flags = 0;
	u32 itag = 0;
	struct rtable *rth;
	struct flowi4 fl4;
	bool do_cache = true;

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which can be not detected
	 * by fib_lookup.
	 */

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
	else
		fl4.flowi4_tun_key.tun_id = 0;
	skb_dst_drop(skb);

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
		goto martian_source;

	res->fi = NULL;
	res->table = NULL;
	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I am not even sure whether to fix this or not. Waiting for complaints :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr))
		goto martian_destination;

	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
	 * and calls it at most once if daddr and/or saddr are loopback addresses.
	 */
	if (ipv4_is_loopback(daddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_destination;
	} else if (ipv4_is_loopback(saddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_source;
	}

	/*
	 *	Now we are ready to route the packet.
	 */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.flowi4_flags = 0;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	fl4.flowi4_uid = sock_net_uid(net, NULL);

	if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
		flkeys = &_flkeys;
	} else {
		fl4.flowi4_proto = 0;
		fl4.fl4_sport = 0;
		fl4.fl4_dport = 0;
	}

	err = fib_lookup(net, &fl4, res, 0);
	if (err != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			err = -EHOSTUNREACH;
		goto no_route;
	}

	if (res->type == RTN_BROADCAST) {
		if (IN_DEV_BFORWARD(in_dev))
			goto make_route;
		/* do not cache if bc_forwarding is enabled */
		if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
			do_cache = false;
		goto brd_input;
	}

	if (res->type == RTN_LOCAL) {
		err = fib_validate_source(skb, saddr, daddr, tos,
					  0, dev, in_dev, &itag);
		if (err < 0)
			goto martian_source;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev)) {
		err = -EHOSTUNREACH;
		goto no_route;
	}
	if (res->type != RTN_UNICAST)
		goto martian_destination;

make_route:
	err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (!ipv4_is_zeronet(saddr)) {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto martian_source;
	}
	flags |= RTCF_BROADCAST;
	res->type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	do_cache &= res->fi && !itag;
	if (do_cache) {
		struct fib_nh_common *nhc = FIB_RES_NHC(*res);

		rth = rcu_dereference(nhc->nhc_rth_input);
		if (rt_cache_valid(rth)) {
			skb_dst_set_noref(skb, &rth->dst);
			err = 0;
			goto out;
		}
	}

	rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
			   flags | RTCF_LOCAL, res->type,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
	if (!rth)
		goto e_nobufs;

	rth->dst.output = ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->rt_is_input = 1;

	RT_CACHE_STAT_INC(in_slow_tot);
	if (res->type == RTN_UNREACHABLE) {
		rth->dst.input = ip_error;
		rth->dst.error = -err;
		rth->rt_flags &= ~RTCF_LOCAL;
	}

	if (do_cache) {
		struct fib_nh_common *nhc = FIB_RES_NHC(*res);

		rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
			WARN_ON(rth->dst.input == lwtunnel_input);
			rth->dst.lwtstate->orig_input = rth->dst.input;
			rth->dst.input = lwtunnel_input;
		}

		if (unlikely(!rt_cache_route(nhc, rth)))
			rt_add_uncached_list(rth);
	}
	skb_dst_set(skb, &rth->dst);
	err = 0;
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	res->type = RTN_UNREACHABLE;
	res->fi = NULL;
	res->table = NULL;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev))
		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
				     &daddr, &saddr, dev->name);
#endif

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}
int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			 u8 tos, struct net_device *dev)
{
	struct fib_result res;
	int err;

	tos &= IPTOS_RT_MASK;
	rcu_read_lock();
	err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
	rcu_read_unlock();

	return err;
}
EXPORT_SYMBOL(ip_route_input_noref);
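/* Usage sketch (editor's assumption, mirroring the ip_route_input()
 * wrapper in <net/route.h>): a caller that needs a held dst reference
 * does roughly:
 *
 *	rcu_read_lock();
 *	err = ip_route_input_noref(skb, daddr, saddr, tos, dev);
 *	if (!err)
 *		skb_dst_force(skb);
 *	rcu_read_unlock();
 */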
/* called with rcu_read_lock held */
int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
		       u8 tos, struct net_device *dev, struct fib_result *res)
{
	/* Multicast recognition logic is moved from the route cache to here.
	 * The problem was that too many Ethernet cards have broken/missing
	 * hardware multicast filters :-( As a result, a host on a multicast
	 * network acquires a lot of useless route cache entries, e.g. for
	 * SDR messages from all over the world. Now we try to get rid of them.
	 * Really, provided the software IP multicast filter is organized
	 * reasonably (at least, hashed), it does not result in a slowdown
	 * compared with route cache reject entries.
	 * Note that multicast routers are not affected, because a
	 * route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);
		int our = 0;
		int err = -EINVAL;

		if (!in_dev)
			return err;
		our = ip_check_mc_rcu(in_dev, daddr, saddr,
				      ip_hdr(skb)->protocol);

		/* check l3 master if no match yet */
		if (!our && netif_is_l3_slave(dev)) {
			struct in_device *l3_in_dev;

			l3_in_dev = __in_dev_get_rcu(skb->dev);
			if (l3_in_dev)
				our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
						      ip_hdr(skb)->protocol);
		}

		if (our
#ifdef CONFIG_IP_MROUTE
			||
		    (!ipv4_is_local_multicast(daddr) &&
		     IN_DEV_MFORWARD(in_dev))
#endif
		   ) {
			err = ip_route_input_mc(skb, daddr, saddr,
						tos, dev, our);
		}
		return err;
	}

	return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
}
/* called with rcu_read_lock() */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4, int orig_oif,
				       struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	struct fib_nh_exception *fnhe;
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;
	bool do_cache;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(fl4->saddr) &&
		    !(dev_out->flags & IFF_LOOPBACK) &&
		    !netif_is_l3_master(dev_out))
			return ERR_PTR(-EINVAL);

	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	do_cache = true;
	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		else
			do_cache = false;
		/* If a multicast route does not exist, use the
		 * default one, but do not gateway in this case.
		 * Yes, it is a hack.
		 */
		if (fi && res->prefixlen < 4)
			fi = NULL;
	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
		   (orig_oif != dev_out->ifindex)) {
		/* For local routes that require a particular output interface
		 * we do not want to cache the result.  Caching the result
		 * causes incorrect behaviour when there are multiple source
		 * addresses on the interface, the end result being that if the
		 * intended recipient is waiting on that interface for the
		 * packet he won't receive it because it will be delivered on
		 * the loopback interface and the IP_PKTINFO ipi_ifindex will
		 * be set to the loopback interface as well.
		 */
		do_cache = false;
	}

	fnhe = NULL;
	do_cache &= fi != NULL;
	if (fi) {
		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
		struct rtable __rcu **prth;

		fnhe = find_exception(nhc, fl4->daddr);
		if (!do_cache)
			goto add;
		if (fnhe) {
			prth = &fnhe->fnhe_rth_output;
		} else {
			if (unlikely(fl4->flowi4_flags &
				     FLOWI_FLAG_KNOWN_NH &&
				     !(nhc->nhc_gw_family &&
				       nhc->nhc_scope == RT_SCOPE_LINK))) {
				do_cache = false;
				goto add;
			}
			prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
		}
		rth = rcu_dereference(*prth);
		if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
			return rth;
	}

add:
	rth = rt_dst_alloc(dev_out, flags, type,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(in_dev, NOXFRM),
			   do_cache);
	if (!rth)
		return ERR_PTR(-ENOBUFS);

	rth->rt_iif = orig_oif;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
	lwtunnel_set_redirect(&rth->dst);

	return rth;
}
/*
 * Major route resolver routine.
 */

struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
					const struct sk_buff *skb)
{
	__u8 tos = RT_FL_TOS(fl4);
	struct fib_result res = {
		.type		= RTN_UNSPEC,
		.fi		= NULL,
		.table		= NULL,
		.tclassid	= 0,
	};
	struct rtable *rth;

	fl4->flowi4_iif = LOOPBACK_IFINDEX;
	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);

	rcu_read_lock();
	rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
	rcu_read_unlock();

	return rth;
}
EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
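/* Usage sketch (hypothetical caller, editor's addition): resolving an
 * output route usually goes through ip_route_output_key(), which ends
 * up here:
 *
 *	struct flowi4 fl4 = { .daddr = daddr, .flowi4_oif = oif };
 *	struct rtable *rt = ip_route_output_key(net, &fl4);
 *
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	...
 *	ip_rt_put(rt);
 */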
struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
					    struct fib_result *res,
					    const struct sk_buff *skb)
{
	struct net_device *dev_out = NULL;
	int orig_oif = fl4->flowi4_oif;
	unsigned int flags = 0;
	struct rtable *rth;
	int err = -ENETUNREACH;

	if (fl4->saddr) {
		rth = ERR_PTR(-EINVAL);
		if (ipv4_is_multicast(fl4->saddr) ||
		    ipv4_is_lbcast(fl4->saddr) ||
		    ipv4_is_zeronet(fl4->saddr))
			goto out;

		/* I removed check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(net, saddr) can return wrong iface, if saddr
		      is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with saddr
		      of another iface. --ANK
		 */

		if (fl4->flowi4_oif == 0 &&
		    (ipv4_is_multicast(fl4->daddr) ||
		     ipv4_is_lbcast(fl4->daddr))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = __ip_dev_find(net, fl4->saddr, false);
			if (!dev_out)
				goto out;

			/* Special hack: user can direct multicasts
			   and limited broadcast via necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic,vat and friends to work.
			   They bind socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of routing cache they are broken,
			   because we are not allowed to build multicast path
			   with loopback source addr (look, routing cache
			   cannot know, that ttl is zero, so that packet
			   will not leave this host and route is valid).
			   Luckily, this hack is good workaround.
			 */

			fl4->flowi4_oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, fl4->saddr, false))
				goto out;
		}
	}

	if (fl4->flowi4_oif) {
		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
		rth = ERR_PTR(-ENODEV);
		if (!dev_out)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
			rth = ERR_PTR(-ENETUNREACH);
			goto out;
		}
		if (ipv4_is_local_multicast(fl4->daddr) ||
		    ipv4_is_lbcast(fl4->daddr) ||
		    fl4->flowi4_proto == IPPROTO_IGMP) {
			if (!fl4->saddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl4->saddr) {
			if (ipv4_is_multicast(fl4->daddr))
				fl4->saddr = inet_select_addr(dev_out, 0,
							      fl4->flowi4_scope);
			else if (!fl4->daddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl4->daddr) {
		fl4->daddr = fl4->saddr;
		if (!fl4->daddr)
			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = LOOPBACK_IFINDEX;
		res->type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	err = fib_lookup(net, fl4, res, 0);
	if (err) {
		res->fi = NULL;
		res->table = NULL;
		if (fl4->flowi4_oif &&
		    (ipv4_is_multicast(fl4->daddr) ||
		    !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
			/* Apparently, routing tables are wrong. Assume,
			   that the destination is on link.

			   WHY? DW.
			   Because we are allowed to send to iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, routing
			   tables are looked up with only one purpose:
			   to catch if destination is gatewayed, rather than
			   direct. Moreover, if MSG_DONTROUTE is set,
			   we send packet, ignoring both routing tables
			   and ifaddr state. --ANK

			   We could make it even if oif is unknown,
			   likely IPv6, but we do not.
			 */

			if (fl4->saddr == 0)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res->type = RTN_UNICAST;
			goto make_route;
		}
		rth = ERR_PTR(err);
		goto out;
	}

	if (res->type == RTN_LOCAL) {
		if (!fl4->saddr) {
			if (res->fi->fib_prefsrc)
				fl4->saddr = res->fi->fib_prefsrc;
			else
				fl4->saddr = fl4->daddr;
		}

		/* L3 master device is the loopback for that domain */
		dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
			net->loopback_dev;

		/* make sure orig_oif points to fib result device even
		 * though packet rx/tx happens over loopback or l3mdev
		 */
		orig_oif = FIB_RES_OIF(*res);

		fl4->flowi4_oif = dev_out->ifindex;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	fib_select_path(net, res, fl4, skb);

	dev_out = FIB_RES_DEV(*res);
	fl4->flowi4_oif = dev_out->ifindex;

make_route:
	rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);

out:
	return rth;
}
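/* The blackhole dst_ops below back ipv4_blackhole_route(): it clones an
 * existing route into a dst that quietly discards traffic: input/output
 * become dst_discard/dst_discard_out, PMTU updates and redirects are
 * ignored, and metrics are never COWed. The xfrm code uses this (via
 * xfrm_lookup_route()) when packets must be dropped without handing the
 * caller an error or a NULL route.
 */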
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}

static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					  struct sk_buff *skb, u32 mtu)
{
}

static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				       struct sk_buff *skb)
{
}

static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
					  unsigned long old)
{
	return NULL;
}

static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			= AF_INET,
	.check			= ipv4_blackhole_dst_check,
	.mtu			= ipv4_blackhole_mtu,
	.default_advmss		= ipv4_default_advmss,
	.update_pmtu		= ipv4_rt_blackhole_update_pmtu,
	.redirect		= ipv4_rt_blackhole_redirect,
	.cow_metrics		= ipv4_rt_blackhole_cow_metrics,
	.neigh_lookup		= ipv4_neigh_lookup,
};
struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rtable *ort = (struct rtable *) dst_orig;
	struct rtable *rt;

	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
	if (rt) {
		struct dst_entry *new = &rt->dst;

		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard_out;

		new->dev = net->loopback_dev;
		if (new->dev)
			dev_hold(new->dev);

		rt->rt_is_input = ort->rt_is_input;
		rt->rt_iif = ort->rt_iif;
		rt->rt_pmtu = ort->rt_pmtu;
		rt->rt_mtu_locked = ort->rt_mtu_locked;

		rt->rt_genid = rt_genid_ipv4(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_uses_gateway = ort->rt_uses_gateway;
		rt->rt_gw_family = ort->rt_gw_family;
		if (rt->rt_gw_family == AF_INET)
			rt->rt_gw4 = ort->rt_gw4;
		else if (rt->rt_gw_family == AF_INET6)
			rt->rt_gw6 = ort->rt_gw6;

		INIT_LIST_HEAD(&rt->rt_uncached);
	}

	dst_release(dst_orig);

	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}
struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
				    const struct sock *sk)
{
	struct rtable *rt = __ip_route_output_key(net, flp4);

	if (IS_ERR(rt))
		return rt;

	if (flp4->flowi4_proto)
		rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
							flowi4_to_flowi(flp4),
							sk, 0);

	return rt;
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);
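/* A sketch of a typical caller (hypothetical, for illustration only):
 *
 *	struct flowi4 fl4 = {
 *		.daddr		= daddr,
 *		.flowi4_proto	= IPPROTO_UDP,
 *	};
 *	struct rtable *rt = ip_route_output_flow(net, &fl4, sk);
 *
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	...
 *	ip_rt_put(rt);
 *
 * A non-zero flowi4_proto is what triggers the xfrm_lookup_route() call
 * above; callers that only want the plain FIB result can use
 * __ip_route_output_key() directly.
 */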
/* called with rcu_read_lock held */
static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
			struct rtable *rt, u32 table_id, struct flowi4 *fl4,
			struct sk_buff *skb, u32 portid, u32 seq,
			unsigned int flags)
{
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned long expires = 0;
	u32 error;
	u32 metrics[RTAX_MAX];

	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
	if (!nlh)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= fl4 ? fl4->flowi4_tos : 0;
	r->rtm_table	= table_id < 256 ? table_id : RT_TABLE_COMPAT;
	if (nla_put_u32(skb, RTA_TABLE, table_id))
		goto nla_put_failure;
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;
	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
		r->rtm_flags |= RTCF_DOREDIRECT;

	if (nla_put_in_addr(skb, RTA_DST, dst))
		goto nla_put_failure;
	if (src) {
		r->rtm_src_len = 32;
		if (nla_put_in_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
	}
	if (rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid &&
	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
		goto nla_put_failure;
#endif
	if (fl4 && !rt_is_input_route(rt) &&
	    fl4->saddr != src) {
		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
			goto nla_put_failure;
	}
	if (rt->rt_uses_gateway) {
		if (rt->rt_gw_family == AF_INET &&
		    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
			goto nla_put_failure;
		} else if (rt->rt_gw_family == AF_INET6) {
			int alen = sizeof(struct in6_addr);
			struct nlattr *nla;
			struct rtvia *via;

			nla = nla_reserve(skb, RTA_VIA, alen + 2);
			if (!nla)
				goto nla_put_failure;

			via = nla_data(nla);
			via->rtvia_family = AF_INET6;
			memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
		}
	}

	expires = rt->dst.expires;
	if (expires) {
		unsigned long now = jiffies;

		if (time_before(now, expires))
			expires -= now;
		else
			expires = 0;
	}

	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	if (rt->rt_pmtu && expires)
		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
	if (rt->rt_mtu_locked && expires)
		metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

	if (fl4) {
		if (fl4->flowi4_mark &&
		    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
			goto nla_put_failure;

		if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
		    nla_put_u32(skb, RTA_UID,
				from_kuid_munged(current_user_ns(),
						 fl4->flowi4_uid)))
			goto nla_put_failure;

		if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
			if (ipv4_is_multicast(dst) &&
			    !ipv4_is_local_multicast(dst) &&
			    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
				int err = ipmr_get_route(net, skb,
							 fl4->saddr, fl4->daddr,
							 r, portid);

				if (err <= 0) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				}
			} else
#endif
				if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
					goto nla_put_failure;
		}
	}

	error = rt->dst.error;

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
			    struct netlink_callback *cb, u32 table_id,
			    struct fnhe_hash_bucket *bucket, int genid,
			    int *fa_index, int fa_start, unsigned int flags)
{
	int i;

	for (i = 0; i < FNHE_HASH_SIZE; i++) {
		struct fib_nh_exception *fnhe;

		for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
		     fnhe = rcu_dereference(fnhe->fnhe_next)) {
			struct rtable *rt;
			int err;

			if (*fa_index < fa_start)
				goto next;

			if (fnhe->fnhe_genid != genid)
				goto next;

			if (fnhe->fnhe_expires &&
			    time_after(jiffies, fnhe->fnhe_expires))
				goto next;

			rt = rcu_dereference(fnhe->fnhe_rth_input);
			if (!rt)
				rt = rcu_dereference(fnhe->fnhe_rth_output);
			if (!rt)
				goto next;

			err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
					   table_id, NULL, skb,
					   NETLINK_CB(cb->skb).portid,
					   cb->nlh->nlmsg_seq, flags);
			if (err)
				return err;
next:
			(*fa_index)++;
		}
	}

	return 0;
}

int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
		       u32 table_id, struct fib_info *fi,
		       int *fa_index, int fa_start, unsigned int flags)
{
	struct net *net = sock_net(cb->skb->sk);
	int nhsel, genid = fnhe_genid(net);

	for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
		struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
		struct fnhe_hash_bucket *bucket;
		int err;

		if (nhc->nhc_flags & RTNH_F_DEAD)
			continue;

		rcu_read_lock();
		bucket = rcu_dereference(nhc->nhc_exceptions);
		err = 0;
		if (bucket)
			err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
					       genid, fa_index, fa_start,
					       flags);
		rcu_read_unlock();
		if (err)
			return err;
	}

	return 0;
}
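/* The two helpers above let the FIB dump path report next-hop exceptions
 * (PMTU values learned from ICMP, redirect targets) when userspace asks
 * for cloned routes; with the old routing cache gone, this is roughly what
 * `ip route show cache` prints.
 */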
static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
						   u8 ip_proto, __be16 sport,
						   __be16 dport)
{
	struct sk_buff *skb;
	struct iphdr *iph;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return NULL;

	/* Reserve room for dummy headers, this skb can pass
	 * through good chunk of routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);
	skb->protocol = htons(ETH_P_IP);
	iph = skb_put(skb, sizeof(struct iphdr));
	iph->protocol = ip_proto;
	iph->saddr = src;
	iph->daddr = dst;
	iph->version = 0x4;
	iph->frag_off = 0;
	iph->ihl = 0x5;
	skb_set_transport_header(skb, skb->len);

	switch (iph->protocol) {
	case IPPROTO_UDP: {
		struct udphdr *udph;

		udph = skb_put_zero(skb, sizeof(struct udphdr));
		udph->source = sport;
		udph->dest = dport;
		udph->len = sizeof(struct udphdr);
		udph->check = 0;
		break;
	}
	case IPPROTO_TCP: {
		struct tcphdr *tcph;

		tcph = skb_put_zero(skb, sizeof(struct tcphdr));
		tcph->source	= sport;
		tcph->dest	= dport;
		tcph->doff	= sizeof(struct tcphdr) / 4;
		tcph->rst = 1;
		tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
					    src, dst, 0);
		break;
	}
	case IPPROTO_ICMP: {
		struct icmphdr *icmph;

		icmph = skb_put_zero(skb, sizeof(struct icmphdr));
		icmph->type = ICMP_ECHO;
		icmph->code = 0;
	}
	}

	return skb;
}
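/* The synthetic transport headers built above are what allow protocol- and
 * port-aware lookups, e.g. (sketch of the iproute2 invocation):
 *
 *	$ ip route get 192.0.2.1 ipproto tcp dport 443
 *
 * to exercise the same matching (fib rules, multipath hashing) as real
 * traffic would.
 */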
static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
				       const struct nlmsghdr *nlh,
				       struct nlattr **tb,
				       struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	int i, err;

	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
		NL_SET_ERR_MSG(extack,
			       "ipv4: Invalid header for route get request");
		return -EINVAL;
	}

	if (!netlink_strict_get_check(skb))
		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
					      rtm_ipv4_policy, extack);

	rtm = nlmsg_data(nlh);
	if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
	    rtm->rtm_table || rtm->rtm_protocol ||
	    rtm->rtm_scope || rtm->rtm_type) {
		NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
		return -EINVAL;
	}

	if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
			       RTM_F_LOOKUP_TABLE |
			       RTM_F_FIB_MATCH)) {
		NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
		return -EINVAL;
	}

	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
					    rtm_ipv4_policy, extack);
	if (err)
		return err;

	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
		NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
		return -EINVAL;
	}

	for (i = 0; i <= RTA_MAX; i++) {
		if (!tb[i])
			continue;

		switch (i) {
		case RTA_IIF:
		case RTA_OIF:
		case RTA_SRC:
		case RTA_DST:
		case RTA_IP_PROTO:
		case RTA_SPORT:
		case RTA_DPORT:
		case RTA_MARK:
		case RTA_UID:
			break;
		default:
			NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
			return -EINVAL;
		}
	}

	return 0;
}
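/* Note: the strict header/attribute checks above only apply to requesters
 * that opted in via the NETLINK_GET_STRICT_CHK socket option (see
 * netlink_strict_get_check()); legacy requesters get the lenient
 * nlmsg_parse_deprecated() path.
 */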
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			     struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	u32 table_id = RT_TABLE_MAIN;
	__be16 sport = 0, dport = 0;
	struct fib_result res = {};
	u8 ip_proto = IPPROTO_UDP;
	struct rtable *rt = NULL;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi4 fl4 = {};
	__be32 dst = 0;
	__be32 src = 0;
	kuid_t uid;
	u32 iif;
	int err;
	int mark;

	err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
	if (err < 0)
		return err;

	rtm = nlmsg_data(nlh);
	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
	if (tb[RTA_UID])
		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
	else
		uid = (iif ? INVALID_UID : current_uid());

	if (tb[RTA_IP_PROTO]) {
		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
						  &ip_proto, AF_INET, extack);
		if (err)
			return err;
	}

	if (tb[RTA_SPORT])
		sport = nla_get_be16(tb[RTA_SPORT]);

	if (tb[RTA_DPORT])
		dport = nla_get_be16(tb[RTA_DPORT]);

	skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
	if (!skb)
		return -ENOBUFS;

	fl4.daddr = dst;
	fl4.saddr = src;
	fl4.flowi4_tos = rtm->rtm_tos;
	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
	fl4.flowi4_mark = mark;
	fl4.flowi4_uid = uid;
	if (sport)
		fl4.fl4_sport = sport;
	if (dport)
		fl4.fl4_dport = dport;
	fl4.flowi4_proto = ip_proto;

	rcu_read_lock();

	if (iif) {
		struct net_device *dev;

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			err = -ENODEV;
			goto errout_rcu;
		}

		fl4.flowi4_iif = iif; /* for rt_fill_info */
		skb->dev	= dev;
		skb->mark	= mark;
		err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
					 dev, &res);

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		fl4.flowi4_iif = LOOPBACK_IFINDEX;
		skb->dev = net->loopback_dev;
		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
		else
			skb_dst_set(skb, &rt->dst);
	}

	if (err)
		goto errout_rcu;

	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
		table_id = res.table ? res.table->tb_id : 0;

	/* reset skb for netlink reply msg */
	skb_trim(skb, 0);
	skb_reset_network_header(skb);
	skb_reset_transport_header(skb);
	skb_reset_mac_header(skb);

	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
		if (!res.fi) {
			err = fib_props[res.type].error;
			if (!err)
				err = -EHOSTUNREACH;
			goto errout_rcu;
		}
		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
				    rt->rt_type, res.prefix, res.prefixlen,
				    fl4.flowi4_tos, res.fi, 0);
	} else {
		err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
				   NETLINK_CB(in_skb).portid,
				   nlh->nlmsg_seq, 0);
	}
	if (err < 0)
		goto errout_rcu;

	rcu_read_unlock();

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);

errout_free:
	return err;

errout_rcu:
	rcu_read_unlock();
	kfree_skb(skb);
	goto errout_free;
}
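/* Userspace view of the RTM_GETROUTE handler above, as a sketch:
 *
 *	$ ip route get 192.0.2.1 from 198.51.100.7 iif eth0
 *
 * iproute2 encodes the arguments as RTA_DST/RTA_SRC/RTA_IIF and prints the
 * RTM_NEWROUTE reply built by rt_fill_info(), or by fib_dump_info() when
 * "fibmatch" sets RTM_F_FIB_MATCH to request the matching FIB entry rather
 * than the resolved route.
 */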
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev));
}
#ifdef CONFIG_SYSCTL
static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_min_valid_pmtu __read_mostly	= IPV4_MIN_MTU;

static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
				     void __user *buffer,
				     size_t *lenp, loff_t *ppos)
{
	struct net *net = (struct net *)__ctl->extra1;

	if (write) {
		rt_cache_flush(net);
		fnhe_genid_bump(net);
		return 0;
	}

	return -EINVAL;
}
static struct ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &ip_min_valid_pmtu,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};

static const char ipv4_route_flush_procname[] = "flush";

static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= ipv4_route_flush_procname,
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};
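/* The flush file is write-only (mode 0200). Writing any value flushes
 * cached routing state and bumps the exception genid, e.g.:
 *
 *	# sysctl -w net.ipv4.route.flush=1
 *
 * Reads fall through to the -EINVAL branch of ipv4_sysctl_rtcache_flush().
 */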
static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (!tbl)
			goto err_dup;

		/* Don't export non-whitelisted sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns) {
			if (tbl[0].procname != ipv4_route_flush_procname)
				tbl[0].procname = NULL;
		}
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
	if (!net->ipv4.route_hdr)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif
static __net_init int rt_genid_init(struct net *net)
{
	atomic_set(&net->ipv4.rt_genid, 0);
	atomic_set(&net->fnhe_genid, 0);
	atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
	return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};

static int __net_init ipv4_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv4.peers = bp;
	return 0;
}

static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv4.peers;

	net->ipv4.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init	= ipv4_inetpeer_init,
	.exit	= ipv4_inetpeer_exit,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
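/* ip_rt_init() below runs once at boot, early in IPv4 bring-up (it is
 * called from ip_init(), itself invoked during inet_init()), before any
 * route lookups can happen.
 */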
int __init ip_rt_init(void)
{
	int cpu;

	ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents),
				  GFP_KERNEL);
	if (!ip_idents)
		panic("IP: failed to allocate ip_idents\n");

	prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));

	ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
	if (!ip_tstamps)
		panic("IP: failed to allocate ip_tstamps\n");

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}
#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	ipv4_dst_ops.gc_thresh = ~0;
	ip_rt_max_size = INT_MAX;

	devinet_init();
	ip_fib_init();

	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init();
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
		      RTNL_FLAG_DOIT_UNLOCKED);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	register_pernet_subsys(&ipv4_inetpeer_ops);
	return 0;
}

#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif