net/ipv4/tcp_ipv4.c
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Implementation of the Transmission Control Protocol(TCP).
 *
 *              IPv4 specific functions
 *
 *              code split from:
 *              linux/ipv4/tcp.c
 *              linux/ipv4/tcp_input.c
 *              linux/ipv4/tcp_output.c
 *
 *              See tcp.c for author information
 */

/*
 * Changes:
 *              David S. Miller :       New socket lookup architecture.
 *                                      This code is dedicated to John Dyson.
 *              David S. Miller :       Change semantics of established hash,
 *                                      half is devoted to TIME_WAIT sockets
 *                                      and the rest go in the other half.
 *              Andi Kleen :            Add support for syncookies and fixed
 *                                      some bugs: ip options weren't passed to
 *                                      the TCP layer, missed a check for an
 *                                      ACK bit.
 *              Andi Kleen :            Implemented fast path mtu discovery.
 *                                      Fixed many serious bugs in the
 *                                      request_sock handling and moved
 *                                      most of it into the af independent code.
 *                                      Added tail drop and some other bugfixes.
 *                                      Added new listen semantics.
 *              Mike McLagan    :       Routing by source
 *      Juan Jose Ciarlante:            ip_dynaddr bits
 *              Andi Kleen:             various fixes.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year
 *                                      coma.
 *      Andi Kleen              :       Fix new listen.
 *      Andi Kleen              :       Fix accept error reporting.
 *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
 *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
 *                                      a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

#include <trace/events/tcp.h>

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
                               __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

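/* Initial sequence number and timestamp offset for passive connections
 * are derived from the flow's 4-tuple (plus per-boot secret state) by
 * secure_tcp_seq()/secure_tcp_ts_off(), in the spirit of RFC 6528: hard
 * to predict for an off-path attacker, yet stable for a given flow.
 */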
static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
        return secure_tcp_seq(ip_hdr(skb)->daddr,
                              ip_hdr(skb)->saddr,
                              tcp_hdr(skb)->dest,
                              tcp_hdr(skb)->source);
}

static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
        return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}

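/* Decide whether a new connection may reuse a port pair whose previous
 * incarnation is still in TIME-WAIT.  With sysctl_tcp_tw_reuse == 2,
 * reuse is restricted to loopback traffic.  Reuse also requires that
 * the TIME-WAIT socket recorded recent timestamps, so PAWS can reject
 * stray segments from the old incarnation; on success, write_seq is
 * additionally pushed past the old connection's snd_nxt so the sequence
 * spaces do not overlap.
 */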
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
        const struct inet_timewait_sock *tw = inet_twsk(sktw);
        const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
        struct tcp_sock *tp = tcp_sk(sk);
        int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;

        if (reuse == 2) {
                /* Still does not detect *everything* that goes through
                 * lo, since we require a loopback src or dst address
                 * or direct binding to 'lo' interface.
                 */
                bool loopback = false;
                if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
                        loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
                if (tw->tw_family == AF_INET6) {
                        if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
                            (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
                             (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
                            ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
                            (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
                             (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
                                loopback = true;
                } else
#endif
                {
                        if (ipv4_is_loopback(tw->tw_daddr) ||
                            ipv4_is_loopback(tw->tw_rcv_saddr))
                                loopback = true;
                }
                if (!loopback)
                        reuse = 0;
        }

        /* With PAWS, it is safe from the viewpoint
           of data integrity. Even without PAWS it is safe provided sequence
           spaces do not overlap i.e. at data rates <= 80Mbit/sec.

           Actually, the idea is close to VJ's one, only timestamp cache is
           held not per host, but per port pair and TW bucket is used as state
           holder.

           If TW bucket has been already destroyed we fall back to VJ's scheme
           and use initial timestamp retrieved from peer table.
         */
        if (tcptw->tw_ts_recent_stamp &&
            (!twp || (reuse && time_after32(ktime_get_seconds(),
                                            tcptw->tw_ts_recent_stamp)))) {
                /* In case of repair and re-using TIME-WAIT sockets we still
                 * want to be sure that it is safe as above but honor the
                 * sequence numbers and time stamps set as part of the repair
                 * process.
                 *
                 * Without this check re-using a TIME-WAIT socket with TCP
                 * repair would accumulate a -1 on the repair assigned
                 * sequence number. The first time it is reused the sequence
                 * is -1, the second time -2, etc. This fixes that issue
                 * without appearing to create any others.
                 */
                if (likely(!tp->repair)) {
                        tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
                        if (tp->write_seq == 0)
                                tp->write_seq = 1;
                        tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
                        tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
                }
                sock_hold(sktw);
                return 1;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);

static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
                              int addr_len)
{
        /* This check is replicated from tcp_v4_connect() and intended to
         * prevent the BPF program called below from accessing bytes that are
         * outside the bounds specified by the user in addr_len.
         */
        if (addr_len < sizeof(struct sockaddr_in))
                return -EINVAL;

        sock_owned_by_me(sk);

        return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
}

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
        struct inet_sock *inet = inet_sk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        __be16 orig_sport, orig_dport;
        __be32 daddr, nexthop;
        struct flowi4 *fl4;
        struct rtable *rt;
        int err;
        struct ip_options_rcu *inet_opt;
        struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

        if (addr_len < sizeof(struct sockaddr_in))
                return -EINVAL;

        if (usin->sin_family != AF_INET)
                return -EAFNOSUPPORT;

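        /* With IP source routing, connect to the first hop given in the
         * option (faddr) while daddr remains the final destination.
         */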
        nexthop = daddr = usin->sin_addr.s_addr;
        inet_opt = rcu_dereference_protected(inet->inet_opt,
                                             lockdep_sock_is_held(sk));
        if (inet_opt && inet_opt->opt.srr) {
                if (!daddr)
                        return -EINVAL;
                nexthop = inet_opt->opt.faddr;
        }

        orig_sport = inet->inet_sport;
        orig_dport = usin->sin_port;
        fl4 = &inet->cork.fl.u.ip4;
        rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
                              RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
                              IPPROTO_TCP,
                              orig_sport, orig_dport, sk);
        if (IS_ERR(rt)) {
                err = PTR_ERR(rt);
                if (err == -ENETUNREACH)
                        IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
                return err;
        }

        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
                ip_rt_put(rt);
                return -ENETUNREACH;
        }

        if (!inet_opt || !inet_opt->opt.srr)
                daddr = fl4->daddr;

        if (!inet->inet_saddr)
                inet->inet_saddr = fl4->saddr;
        sk_rcv_saddr_set(sk, inet->inet_saddr);

        if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
                /* Reset inherited state */
                tp->rx_opt.ts_recent       = 0;
                tp->rx_opt.ts_recent_stamp = 0;
                if (likely(!tp->repair))
                        tp->write_seq      = 0;
        }

        inet->inet_dport = usin->sin_port;
        sk_daddr_set(sk, daddr);

        inet_csk(sk)->icsk_ext_hdr_len = 0;
        if (inet_opt)
                inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

        tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

        /* Socket identity is still unknown (sport may be zero).
         * However we set the state to SYN-SENT and, without releasing the
         * socket lock, select a source port, enter ourselves into the hash
         * tables and complete initialization after this.
         */
        tcp_set_state(sk, TCP_SYN_SENT);
        err = inet_hash_connect(tcp_death_row, sk);
        if (err)
                goto failure;

        sk_set_txhash(sk);

        rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
                               inet->inet_sport, inet->inet_dport, sk);
        if (IS_ERR(rt)) {
                err = PTR_ERR(rt);
                rt = NULL;
                goto failure;
        }
        /* OK, now commit destination to socket.  */
        sk->sk_gso_type = SKB_GSO_TCPV4;
        sk_setup_caps(sk, &rt->dst);
        rt = NULL;

        if (likely(!tp->repair)) {
                if (!tp->write_seq)
                        tp->write_seq = secure_tcp_seq(inet->inet_saddr,
                                                       inet->inet_daddr,
                                                       inet->inet_sport,
                                                       usin->sin_port);
                tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
                                                 inet->inet_saddr,
                                                 inet->inet_daddr);
        }

        inet->inet_id = tp->write_seq ^ jiffies;

        if (tcp_fastopen_defer_connect(sk, &err))
                return err;
        if (err)
                goto failure;

        err = tcp_connect(sk);

        if (err)
                goto failure;

        return 0;

failure:
        /*
         * This unhashes the socket and releases the local port,
         * if necessary.
         */
        tcp_set_state(sk, TCP_CLOSE);
        ip_rt_put(rt);
        sk->sk_route_caps = 0;
        inet->inet_dport = 0;
        return err;
}
EXPORT_SYMBOL(tcp_v4_connect);

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
        struct inet_sock *inet = inet_sk(sk);
        struct dst_entry *dst;
        u32 mtu;

        if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
                return;
        mtu = tcp_sk(sk)->mtu_info;
        dst = inet_csk_update_pmtu(sk, mtu);
        if (!dst)
                return;

        /* Something is about to go wrong... Remember the soft error
         * for the case this connection is not able to recover.
         */
        if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
                sk->sk_err_soft = EMSGSIZE;

        mtu = dst_mtu(dst);

        if (inet->pmtudisc != IP_PMTUDISC_DONT &&
            ip_sk_accept_pmtu(sk) &&
            inet_csk(sk)->icsk_pmtu_cookie > mtu) {
                tcp_sync_mss(sk, mtu);

                /* Resend the TCP packet because it's
                 * clear that the old packet has been
                 * dropped. This is the new "fast" path mtu
                 * discovery.
                 */
                tcp_simple_retransmit(sk);
        } /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);

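/* ICMP redirect: revalidate the socket's cached route and let the
 * route's dst_ops update the next hop.
 */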
static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
        struct dst_entry *dst = __sk_dst_check(sk, 0);

        if (dst)
                dst->ops->redirect(dst, sk, skb);
}

/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
        struct request_sock *req = inet_reqsk(sk);
        struct net *net = sock_net(sk);

        /* ICMPs are not backlogged, hence we cannot get
         * an established socket here.
         */
        if (seq != tcp_rsk(req)->snt_isn) {
                __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
        } else if (abort) {
                /*
                 * Still in SYN_RECV, just remove it silently.
                 * There is no good way to pass the error to the newly
                 * created socket, and POSIX does not want network
                 * errors returned from accept().
                 */
                inet_csk_reqsk_queue_drop(req->rsk_listener, req);
                tcp_listendrop(req->rsk_listener);
        }
        reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */
int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
        const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
        struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
        struct inet_connection_sock *icsk;
        struct tcp_sock *tp;
        struct inet_sock *inet;
        const int type = icmp_hdr(icmp_skb)->type;
        const int code = icmp_hdr(icmp_skb)->code;
        struct sock *sk;
        struct sk_buff *skb;
        struct request_sock *fastopen;
        u32 seq, snd_una;
        s32 remaining;
        u32 delta_us;
        int err;
        struct net *net = dev_net(icmp_skb->dev);

        sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
                                       th->dest, iph->saddr, ntohs(th->source),
                                       inet_iif(icmp_skb), 0);
        if (!sk) {
                __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
                return -ENOENT;
        }
        if (sk->sk_state == TCP_TIME_WAIT) {
                inet_twsk_put(inet_twsk(sk));
                return 0;
        }
        seq = ntohl(th->seq);
        if (sk->sk_state == TCP_NEW_SYN_RECV) {
                tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
                                     type == ICMP_TIME_EXCEEDED ||
                                     (type == ICMP_DEST_UNREACH &&
                                      (code == ICMP_NET_UNREACH ||
                                       code == ICMP_HOST_UNREACH)));
                return 0;
        }

        bh_lock_sock(sk);
        /* If too many ICMPs get dropped on busy
         * servers this needs to be solved differently.
         * We do take care of PMTU discovery (RFC1191) special case :
         * we can receive locally generated ICMP messages while socket is held.
         */
        if (sock_owned_by_user(sk)) {
                if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
                        __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
        }
        if (sk->sk_state == TCP_CLOSE)
                goto out;

        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
                __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
                goto out;
        }

        icsk = inet_csk(sk);
        tp = tcp_sk(sk);
        /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
        fastopen = tp->fastopen_rsk;
        snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
        if (sk->sk_state != TCP_LISTEN &&
            !between(seq, snd_una, tp->snd_nxt)) {
                __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
                goto out;
        }

        switch (type) {
        case ICMP_REDIRECT:
                if (!sock_owned_by_user(sk))
                        do_redirect(icmp_skb, sk);
                goto out;
        case ICMP_SOURCE_QUENCH:
                /* Just silently ignore these. */
                goto out;
        case ICMP_PARAMETERPROB:
                err = EPROTO;
                break;
        case ICMP_DEST_UNREACH:
                if (code > NR_ICMP_UNREACH)
                        goto out;

                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
                        /* We are not interested in TCP_LISTEN and open_requests
                         * (SYN-ACKs sent out by Linux are always < 576 bytes,
                         * so they should go through unfragmented).
                         */
                        if (sk->sk_state == TCP_LISTEN)
                                goto out;

                        tp->mtu_info = info;
                        if (!sock_owned_by_user(sk)) {
                                tcp_v4_mtu_reduced(sk);
                        } else {
                                if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
                                        sock_hold(sk);
                        }
                        goto out;
                }

                err = icmp_err_convert[code].errno;
                /* check if icmp_skb allows revert of backoff
                 * (see draft-zimmermann-tcp-lcd) */
                if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
                        break;
                if (seq != tp->snd_una || !icsk->icsk_retransmits ||
                    !icsk->icsk_backoff || fastopen)
                        break;

                if (sock_owned_by_user(sk))
                        break;

                skb = tcp_rtx_queue_head(sk);
                if (WARN_ON_ONCE(!skb))
                        break;

                icsk->icsk_backoff--;
                icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
                                               TCP_TIMEOUT_INIT;
                icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

                tcp_mstamp_refresh(tp);
                delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
                remaining = icsk->icsk_rto -
                            usecs_to_jiffies(delta_us);

                if (remaining > 0) {
                        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
                                                  remaining, TCP_RTO_MAX);
                } else {
                        /* RTO revert clocked out retransmission.
                         * Will retransmit now */
                        tcp_retransmit_timer(sk);
                }

                break;
        case ICMP_TIME_EXCEEDED:
                err = EHOSTUNREACH;
                break;
        default:
                goto out;
        }

        switch (sk->sk_state) {
        case TCP_SYN_SENT:
        case TCP_SYN_RECV:
                /* Only in fast or simultaneous open. If a fast open socket
                 * is already accepted it is treated as a connected one below.
                 */
                if (fastopen && !fastopen->sk)
                        break;

                if (!sock_owned_by_user(sk)) {
                        sk->sk_err = err;

                        sk->sk_error_report(sk);

                        tcp_done(sk);
                } else {
                        sk->sk_err_soft = err;
                }
                goto out;
        }

        /* If we've already connected we will keep trying
         * until we time out, or the user gives up.
         *
         * rfc1122 4.2.3.9 allows us to consider as hard errors
         * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
         * but it is obsoleted by pmtu discovery).
         *
         * Note that in the modern internet, where routing is unreliable
         * and broken firewalls sit in every dark corner, sending random
         * errors ordered by their masters, even these two messages finally
         * lose their original sense (even Linux sends invalid PORT_UNREACHs).
         *
         * Now we are in compliance with RFCs.
         *                                                      --ANK (980905)
         */

        inet = inet_sk(sk);
        if (!sock_owned_by_user(sk) && inet->recverr) {
                sk->sk_err = err;
                sk->sk_error_report(sk);
        } else { /* Only an error on timeout */
                sk->sk_err_soft = err;
        }

out:
        bh_unlock_sock(sk);
        sock_put(sk);
        return 0;
}

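/* Prepare an outgoing segment for checksum offload: seed th->check with
 * the complemented pseudo-header sum and record where the device (or
 * the software fallback) must fold in the checksum over the payload.
 */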
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
        struct tcphdr *th = tcp_hdr(skb);

        th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
        skb->csum_start = skb_transport_header(skb) - skb->head;
        skb->csum_offset = offsetof(struct tcphdr, check);
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
        const struct inet_sock *inet = inet_sk(sk);

        __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

/*
 *      This routine will send an RST to the other tcp.
 *
 *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 *                    for reset.
 *      Answer: if a packet caused an RST, it is not for a socket
 *              existing in our system; if it is matched to a socket,
 *              it is just a duplicate segment or a bug in the other
 *              side's TCP. So we build the reply based only on the
 *              parameters that arrived with the segment.
 *      Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
        const struct tcphdr *th = tcp_hdr(skb);
        struct {
                struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
                __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
        } rep;
        struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
        struct tcp_md5sig_key *key = NULL;
        const __u8 *hash_location = NULL;
        unsigned char newhash[16];
        int genhash;
        struct sock *sk1 = NULL;
#endif
        struct net *net;
        struct sock *ctl_sk;

        /* Never send a reset in response to a reset. */
        if (th->rst)
                return;

        /* If sk not NULL, it means we did a successful lookup and incoming
         * route had to be correct. prequeue might have dropped our dst.
         */
        if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
                return;

        /* Swap the send and the receive. */
        memset(&rep, 0, sizeof(rep));
        rep.th.dest   = th->source;
        rep.th.source = th->dest;
        rep.th.doff   = sizeof(struct tcphdr) / 4;
        rep.th.rst    = 1;

        if (th->ack) {
                rep.th.seq = th->ack_seq;
        } else {
                rep.th.ack = 1;
                rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
                                       skb->len - (th->doff << 2));
        }

        memset(&arg, 0, sizeof(arg));
        arg.iov[0].iov_base = (unsigned char *)&rep;
        arg.iov[0].iov_len  = sizeof(rep.th);

        net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
        rcu_read_lock();
        hash_location = tcp_parse_md5sig_option(th);
        if (sk && sk_fullsock(sk)) {
                key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
                                        &ip_hdr(skb)->saddr, AF_INET);
        } else if (hash_location) {
                /*
                 * The active side is lost. Try to find the listening socket
                 * through the source port, and then find the md5 key through
                 * the listening socket. We do not lose security here:
                 * the incoming packet is checked with the md5 hash of the
                 * found key, and no RST is generated if the md5 hash
                 * doesn't match.
                 */
                sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
                                             ip_hdr(skb)->saddr,
                                             th->source, ip_hdr(skb)->daddr,
                                             ntohs(th->source), inet_iif(skb),
                                             tcp_v4_sdif(skb));
                /* don't send rst if it can't find key */
                if (!sk1)
                        goto out;

                key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
                                        &ip_hdr(skb)->saddr, AF_INET);
                if (!key)
                        goto out;

                genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
                if (genhash || memcmp(hash_location, newhash, 16) != 0)
                        goto out;
        }

        if (key) {
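                /* MD5 signature option, padded to a 32-bit boundary:
                 * two NOPs, then kind (19) and length (18).  The
                 * 16-byte digest itself is written by
                 * tcp_v4_md5_hash_hdr() below.
                 */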
                rep.opt[0] = htonl((TCPOPT_NOP << 24) |
                                   (TCPOPT_NOP << 16) |
                                   (TCPOPT_MD5SIG << 8) |
                                   TCPOLEN_MD5SIG);
                /* Update length and the length the header thinks exists */
                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
                rep.th.doff = arg.iov[0].iov_len / 4;

                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
                                     key, ip_hdr(skb)->saddr,
                                     ip_hdr(skb)->daddr, &rep.th);
        }
#endif
        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
                                      ip_hdr(skb)->saddr, /* XXX */
                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
        arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

        /* When the socket is gone, all binding information is lost;
         * routing might fail in this case. No choice here: if we chose to
         * force the input interface, we would misroute in the case of an
         * asymmetric route.
         */
        if (sk) {
                arg.bound_dev_if = sk->sk_bound_dev_if;
                if (sk_fullsock(sk))
                        trace_tcp_send_reset(sk, skb);
        }

        BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
                     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

        arg.tos = ip_hdr(skb)->tos;
        arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
        local_bh_disable();
        ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
        if (sk)
                ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
                                   inet_twsk(sk)->tw_mark : sk->sk_mark;
        ip_send_unicast_reply(ctl_sk,
                              skb, &TCP_SKB_CB(skb)->header.h4.opt,
                              ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
                              &arg, arg.iov[0].iov_len);

        ctl_sk->sk_mark = 0;
        __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
        __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
        local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
        rcu_read_unlock();
#endif
}

/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside socket context, is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(const struct sock *sk,
                            struct sk_buff *skb, u32 seq, u32 ack,
                            u32 win, u32 tsval, u32 tsecr, int oif,
                            struct tcp_md5sig_key *key,
                            int reply_flags, u8 tos)
{
        const struct tcphdr *th = tcp_hdr(skb);
        struct {
                struct tcphdr th;
                __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
                           + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
                        ];
        } rep;
        struct net *net = sock_net(sk);
        struct ip_reply_arg arg;
        struct sock *ctl_sk;

        memset(&rep.th, 0, sizeof(struct tcphdr));
        memset(&arg, 0, sizeof(arg));

        arg.iov[0].iov_base = (unsigned char *)&rep;
        arg.iov[0].iov_len  = sizeof(rep.th);
        if (tsecr) {
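                /* Timestamps option, padded to a 32-bit boundary: two
                 * NOPs, kind (8) and length (10), followed by the TSval
                 * and TSecr words.
                 */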
                rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
                                   (TCPOPT_TIMESTAMP << 8) |
                                   TCPOLEN_TIMESTAMP);
                rep.opt[1] = htonl(tsval);
                rep.opt[2] = htonl(tsecr);
                arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
        }

        /* Swap the send and the receive. */
        rep.th.dest    = th->source;
        rep.th.source  = th->dest;
        rep.th.doff    = arg.iov[0].iov_len / 4;
        rep.th.seq     = htonl(seq);
        rep.th.ack_seq = htonl(ack);
        rep.th.ack     = 1;
        rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
        if (key) {
                int offset = (tsecr) ? 3 : 0;

                rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
                                          (TCPOPT_NOP << 16) |
                                          (TCPOPT_MD5SIG << 8) |
                                          TCPOLEN_MD5SIG);
                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
                rep.th.doff = arg.iov[0].iov_len / 4;

                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
                                    key, ip_hdr(skb)->saddr,
                                    ip_hdr(skb)->daddr, &rep.th);
        }
#endif
        arg.flags = reply_flags;
        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
                                      ip_hdr(skb)->saddr, /* XXX */
                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
        if (oif)
                arg.bound_dev_if = oif;
        arg.tos = tos;
        arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
        local_bh_disable();
        ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
        if (sk)
                ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
                                   inet_twsk(sk)->tw_mark : sk->sk_mark;
        ip_send_unicast_reply(ctl_sk,
                              skb, &TCP_SKB_CB(skb)->header.h4.opt,
                              ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
                              &arg, arg.iov[0].iov_len);

        ctl_sk->sk_mark = 0;
        __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
        local_bh_enable();
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
        struct inet_timewait_sock *tw = inet_twsk(sk);
        struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

        tcp_v4_send_ack(sk, skb,
                        tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
                        tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
                        tcp_time_stamp_raw() + tcptw->tw_ts_offset,
                        tcptw->tw_ts_recent,
                        tw->tw_bound_dev_if,
                        tcp_twsk_md5_key(tcptw),
                        tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
                        tw->tw_tos
                        );

        inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
                                  struct request_sock *req)
{
        /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
         * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
         */
        u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
                                             tcp_sk(sk)->snd_nxt;

        /* RFC 7323 2.3
         * The window field (SEG.WND) of every outgoing segment, with the
         * exception of <SYN> segments, MUST be right-shifted by
         * Rcv.Wind.Shift bits:
         */
        tcp_v4_send_ack(sk, skb, seq,
                        tcp_rsk(req)->rcv_nxt,
                        req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
                        tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
                        req->ts_recent,
                        0,
                        tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
                                          AF_INET),
                        inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
                        ip_hdr(skb)->tos);
}

/*
 *      Send a SYN-ACK after having received a SYN.
 *      This still operates on a request_sock only, not on a big
 *      socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
                              struct flowi *fl,
                              struct request_sock *req,
                              struct tcp_fastopen_cookie *foc,
                              enum tcp_synack_type synack_type)
{
        const struct inet_request_sock *ireq = inet_rsk(req);
        struct flowi4 fl4;
        int err = -1;
        struct sk_buff *skb;

        /* First, grab a route. */
        if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
                return -1;

        skb = tcp_make_synack(sk, dst, req, foc, synack_type);

        if (skb) {
                __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

                rcu_read_lock();
                err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
                                            ireq->ir_rmt_addr,
                                            rcu_dereference(ireq->ireq_opt));
                rcu_read_unlock();
                err = net_xmit_eval(err);
        }

        return err;
}

/*
 *      IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
        kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
EXPORT_SYMBOL(tcp_md5_needed);

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk,
                                           const union tcp_md5_addr *addr,
                                           int family)
{
        const struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_md5sig_key *key;
        const struct tcp_md5sig_info *md5sig;
        __be32 mask;
        struct tcp_md5sig_key *best_match = NULL;
        bool match;

        /* caller either holds rcu_read_lock() or socket lock */
        md5sig = rcu_dereference_check(tp->md5sig_info,
                                       lockdep_sock_is_held(sk));
        if (!md5sig)
                return NULL;

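        /* Several configured keys may match the peer address; scan the
         * whole list and keep the most specific (longest-prefix) match.
         */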
        hlist_for_each_entry_rcu(key, &md5sig->head, node) {
                if (key->family != family)
                        continue;

                if (family == AF_INET) {
                        mask = inet_make_mask(key->prefixlen);
                        match = (key->addr.a4.s_addr & mask) ==
                                (addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
                } else if (family == AF_INET6) {
                        match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
                                                  key->prefixlen);
#endif
                } else {
                        match = false;
                }

                if (match && (!best_match ||
                              key->prefixlen > best_match->prefixlen))
                        best_match = key;
        }
        return best_match;
}
EXPORT_SYMBOL(__tcp_md5_do_lookup);

static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
                                                      const union tcp_md5_addr *addr,
                                                      int family, u8 prefixlen)
{
        const struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_md5sig_key *key;
        unsigned int size = sizeof(struct in_addr);
        const struct tcp_md5sig_info *md5sig;

        /* caller either holds rcu_read_lock() or socket lock */
        md5sig = rcu_dereference_check(tp->md5sig_info,
                                       lockdep_sock_is_held(sk));
        if (!md5sig)
                return NULL;
#if IS_ENABLED(CONFIG_IPV6)
        if (family == AF_INET6)
                size = sizeof(struct in6_addr);
#endif
        hlist_for_each_entry_rcu(key, &md5sig->head, node) {
                if (key->family != family)
                        continue;
                if (!memcmp(&key->addr, addr, size) &&
                    key->prefixlen == prefixlen)
                        return key;
        }
        return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
                                         const struct sock *addr_sk)
{
        const union tcp_md5_addr *addr;

        addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
        return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
                   int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
                   gfp_t gfp)
{
        /* Add Key to the list */
        struct tcp_md5sig_key *key;
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_md5sig_info *md5sig;

        key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
        if (key) {
                /* Pre-existing entry - just update that one. */
                memcpy(key->key, newkey, newkeylen);
                key->keylen = newkeylen;
                return 0;
        }

        md5sig = rcu_dereference_protected(tp->md5sig_info,
                                           lockdep_sock_is_held(sk));
        if (!md5sig) {
                md5sig = kmalloc(sizeof(*md5sig), gfp);
                if (!md5sig)
                        return -ENOMEM;

                sk_nocaps_add(sk, NETIF_F_GSO_MASK);
                INIT_HLIST_HEAD(&md5sig->head);
                rcu_assign_pointer(tp->md5sig_info, md5sig);
        }

        key = sock_kmalloc(sk, sizeof(*key), gfp);
        if (!key)
                return -ENOMEM;
        if (!tcp_alloc_md5sig_pool()) {
                sock_kfree_s(sk, key, sizeof(*key));
                return -ENOMEM;
        }

        memcpy(key->key, newkey, newkeylen);
        key->keylen = newkeylen;
        key->family = family;
        key->prefixlen = prefixlen;
        memcpy(&key->addr, addr,
               (family == AF_INET6) ? sizeof(struct in6_addr) :
                                      sizeof(struct in_addr));
        hlist_add_head_rcu(&key->node, &md5sig->head);
        return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
                   u8 prefixlen)
{
        struct tcp_md5sig_key *key;

        key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
        if (!key)
                return -ENOENT;
        hlist_del_rcu(&key->node);
        atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
        kfree_rcu(key, rcu);
        return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

static void tcp_clear_md5_list(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_md5sig_key *key;
        struct hlist_node *n;
        struct tcp_md5sig_info *md5sig;

        md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

        hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
                hlist_del_rcu(&key->node);
                atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
                kfree_rcu(key, rcu);
        }
}

static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
                                 char __user *optval, int optlen)
{
        struct tcp_md5sig cmd;
        struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
        u8 prefixlen = 32;

        if (optlen < sizeof(cmd))
                return -EINVAL;

        if (copy_from_user(&cmd, optval, sizeof(cmd)))
                return -EFAULT;

        if (sin->sin_family != AF_INET)
                return -EINVAL;

        if (optname == TCP_MD5SIG_EXT &&
            cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
                prefixlen = cmd.tcpm_prefixlen;
                if (prefixlen > 32)
                        return -EINVAL;
        }

        if (!cmd.tcpm_keylen)
                return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
                                      AF_INET, prefixlen);

        if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
                return -EINVAL;

        return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
                              AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
                              GFP_KERNEL);
}

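/* Feed the RFC 2385 header block into the MD5 transform: the IPv4
 * pseudo-header (saddr, daddr, zero pad, protocol, segment length)
 * followed by the fixed TCP header with its checksum zeroed.  Callers
 * then hash the payload (where applicable) and finally the key.
 */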
static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
                                   __be32 daddr, __be32 saddr,
                                   const struct tcphdr *th, int nbytes)
{
        struct tcp4_pseudohdr *bp;
        struct scatterlist sg;
        struct tcphdr *_th;

        bp = hp->scratch;
        bp->saddr = saddr;
        bp->daddr = daddr;
        bp->pad = 0;
        bp->protocol = IPPROTO_TCP;
        bp->len = cpu_to_be16(nbytes);

        _th = (struct tcphdr *)(bp + 1);
        memcpy(_th, th, sizeof(*th));
        _th->check = 0;

        sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
        ahash_request_set_crypt(hp->md5_req, &sg, NULL,
                                sizeof(*bp) + sizeof(*th));
        return crypto_ahash_update(hp->md5_req);
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
                               __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
        struct tcp_md5sig_pool *hp;
        struct ahash_request *req;

        hp = tcp_get_md5sig_pool();
        if (!hp)
                goto clear_hash_noput;
        req = hp->md5_req;

        if (crypto_ahash_init(req))
                goto clear_hash;
        if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
                goto clear_hash;
        if (tcp_md5_hash_key(hp, key))
                goto clear_hash;
        ahash_request_set_crypt(req, NULL, md5_hash, 0);
        if (crypto_ahash_final(req))
                goto clear_hash;

        tcp_put_md5sig_pool();
        return 0;

clear_hash:
        tcp_put_md5sig_pool();
clear_hash_noput:
        memset(md5_hash, 0, 16);
        return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
                        const struct sock *sk,
                        const struct sk_buff *skb)
{
        struct tcp_md5sig_pool *hp;
        struct ahash_request *req;
        const struct tcphdr *th = tcp_hdr(skb);
        __be32 saddr, daddr;

        if (sk) { /* valid for establish/request sockets */
                saddr = sk->sk_rcv_saddr;
                daddr = sk->sk_daddr;
        } else {
                const struct iphdr *iph = ip_hdr(skb);
                saddr = iph->saddr;
                daddr = iph->daddr;
        }

        hp = tcp_get_md5sig_pool();
        if (!hp)
                goto clear_hash_noput;
        req = hp->md5_req;

        if (crypto_ahash_init(req))
                goto clear_hash;

        if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
                goto clear_hash;
        if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
                goto clear_hash;
        if (tcp_md5_hash_key(hp, key))
                goto clear_hash;
        ahash_request_set_crypt(req, NULL, md5_hash, 0);
        if (crypto_ahash_final(req))
                goto clear_hash;

        tcp_put_md5sig_pool();
        return 0;

clear_hash:
        tcp_put_md5sig_pool();
clear_hash_noput:
        memset(md5_hash, 0, 16);
        return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

#endif

/* Called with rcu_read_lock() */
static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
                                    const struct sk_buff *skb)
{
#ifdef CONFIG_TCP_MD5SIG
        /*
         * This gets called for each TCP segment that arrives
         * so we want to be efficient.
         * We have 3 drop cases:
         * o No MD5 hash and one expected.
         * o MD5 hash and we're not expecting one.
         * o MD5 hash and it's wrong.
         */
        const __u8 *hash_location = NULL;
        struct tcp_md5sig_key *hash_expected;
        const struct iphdr *iph = ip_hdr(skb);
        const struct tcphdr *th = tcp_hdr(skb);
        int genhash;
        unsigned char newhash[16];

        hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
                                          AF_INET);
        hash_location = tcp_parse_md5sig_option(th);

        /* We've parsed the options - do we have a hash? */
        if (!hash_expected && !hash_location)
                return false;

        if (hash_expected && !hash_location) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
                return true;
        }

        if (!hash_expected && hash_location) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
                return true;
        }

        /* Okay, so this is hash_expected and hash_location -
         * so we need to calculate the checksum.
         */
        genhash = tcp_v4_md5_hash_skb(newhash,
                                      hash_expected,
                                      NULL, skb);

        if (genhash || memcmp(hash_location, newhash, 16) != 0) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
                net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
                                     &iph->saddr, ntohs(th->source),
                                     &iph->daddr, ntohs(th->dest),
                                     genhash ? " tcp_v4_calc_md5_hash failed"
                                     : "");
                return true;
        }
        return false;
#endif
        return false;
}

static void tcp_v4_init_req(struct request_sock *req,
                            const struct sock *sk_listener,
                            struct sk_buff *skb)
{
        struct inet_request_sock *ireq = inet_rsk(req);
        struct net *net = sock_net(sk_listener);

        sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
        sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
        RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
}

static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
                                          struct flowi *fl,
                                          const struct request_sock *req)
{
        return inet_csk_route_req(sk, &fl->u.ip4, req);
}

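/* IPv4-specific hooks plugged into the address-family-independent
 * connection-request machinery (see tcp_conn_request()).
 */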
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
        .family         =       PF_INET,
        .obj_size       =       sizeof(struct tcp_request_sock),
        .rtx_syn_ack    =       tcp_rtx_synack,
        .send_ack       =       tcp_v4_reqsk_send_ack,
        .destructor     =       tcp_v4_reqsk_destructor,
        .send_reset     =       tcp_v4_send_reset,
        .syn_ack_timeout =      tcp_syn_ack_timeout,
};

static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
        .mss_clamp      =       TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
        .req_md5_lookup =       tcp_v4_md5_lookup,
        .calc_md5_hash  =       tcp_v4_md5_hash_skb,
#endif
        .init_req       =       tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
        .cookie_init_seq =      cookie_v4_init_sequence,
#endif
        .route_req      =       tcp_v4_route_req,
        .init_seq       =       tcp_v4_init_seq,
        .init_ts_off    =       tcp_v4_init_ts_off,
        .send_synack    =       tcp_v4_send_synack,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
        /* Never answer SYNs sent to broadcast or multicast addresses */
        if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
                goto drop;

        return tcp_conn_request(&tcp_request_sock_ops,
                                &tcp_request_sock_ipv4_ops, sk, skb);

drop:
        tcp_listendrop(sk);
        return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);

1396 /*
1397  * The three-way handshake has completed - we got a valid ACK from
1398  * the client - now create the new socket.
1399  */
1400 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1401                                   struct request_sock *req,
1402                                   struct dst_entry *dst,
1403                                   struct request_sock *req_unhash,
1404                                   bool *own_req)
1405 {
1406         struct inet_request_sock *ireq;
1407         struct inet_sock *newinet;
1408         struct tcp_sock *newtp;
1409         struct sock *newsk;
1410 #ifdef CONFIG_TCP_MD5SIG
1411         struct tcp_md5sig_key *key;
1412 #endif
1413         struct ip_options_rcu *inet_opt;
1414
1415         if (sk_acceptq_is_full(sk))
1416                 goto exit_overflow;
1417
1418         newsk = tcp_create_openreq_child(sk, req, skb);
1419         if (!newsk)
1420                 goto exit_nonewsk;
1421
1422         newsk->sk_gso_type = SKB_GSO_TCPV4;
1423         inet_sk_rx_dst_set(newsk, skb);
1424
1425         newtp                 = tcp_sk(newsk);
1426         newinet               = inet_sk(newsk);
1427         ireq                  = inet_rsk(req);
1428         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1429         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1430         newsk->sk_bound_dev_if = ireq->ir_iif;
1431         newinet->inet_saddr   = ireq->ir_loc_addr;
1432         inet_opt              = rcu_dereference(ireq->ireq_opt);
1433         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1434         newinet->mc_index     = inet_iif(skb);
1435         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1436         newinet->rcv_tos      = ip_hdr(skb)->tos;
1437         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1438         if (inet_opt)
1439                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1440         newinet->inet_id = newtp->write_seq ^ jiffies;
1441
1442         if (!dst) {
1443                 dst = inet_csk_route_child_sock(sk, newsk, req);
1444                 if (!dst)
1445                         goto put_and_exit;
1446         } else {
1447                 /* syncookie case: see end of cookie_v4_check() */
1448         }
1449         sk_setup_caps(newsk, dst);
1450
1451         tcp_ca_openreq_child(newsk, dst);
1452
1453         tcp_sync_mss(newsk, dst_mtu(dst));
1454         newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1455
1456         tcp_initialize_rcv_mss(newsk);
1457
1458 #ifdef CONFIG_TCP_MD5SIG
1459         /* Copy over the MD5 key from the original socket */
1460         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1461                                 AF_INET);
1462         if (key) {
1463                 /*
1464                  * We're using one, so create a matching key
1465                  * on the newsk structure. If we fail to get
1466                  * memory, then we end up not copying the key
1467                  * across. Shucks.
1468                  */
1469                 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1470                                AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1471                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1472         }
1473 #endif
1474
1475         if (__inet_inherit_port(sk, newsk) < 0)
1476                 goto put_and_exit;
1477         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1478         if (likely(*own_req)) {
1479                 tcp_move_syn(newtp, req);
1480                 ireq->ireq_opt = NULL;
1481         } else {
1482                 newinet->inet_opt = NULL;
1483         }
1484         return newsk;
1485
1486 exit_overflow:
1487         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1488 exit_nonewsk:
1489         dst_release(dst);
1490 exit:
1491         tcp_listendrop(sk);
1492         return NULL;
1493 put_and_exit:
1494         newinet->inet_opt = NULL;
1495         inet_csk_prepare_forced_close(newsk);
1496         tcp_done(newsk);
1497         goto exit;
1498 }
1499 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
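/* For illustration - from userspace, the child socket created above is
 * what accept() eventually returns. A minimal sketch; port 8080 and a
 * backlog of 128 are assumptions for the example.
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <unistd.h>

static int serve_one(void)
{
	struct sockaddr_in sin = {
		.sin_family = AF_INET,
		.sin_port = htons(8080),
		.sin_addr.s_addr = htonl(INADDR_ANY),
	};
	int lfd = socket(AF_INET, SOCK_STREAM, 0);
	int cfd;

	if (lfd < 0)
		return -1;
	if (bind(lfd, (struct sockaddr *)&sin, sizeof(sin)) < 0 ||
	    listen(lfd, 128) < 0) {
		close(lfd);
		return -1;
	}
	/* Blocks until tcp_v4_syn_recv_sock() has produced a child. */
	cfd = accept(lfd, NULL, NULL);
	close(lfd);
	return cfd;
}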
1500
1501 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1502 {
1503 #ifdef CONFIG_SYN_COOKIES
1504         const struct tcphdr *th = tcp_hdr(skb);
1505
1506         if (!th->syn)
1507                 sk = cookie_v4_check(sk, skb);
1508 #endif
1509         return sk;
1510 }
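/* For illustration - cookie_v4_check() only does real work when the
 * kernel has previously sent syncookies, which the tcp_syncookies
 * sysctl governs (0 = never, 1 = on listen queue overflow, 2 = always).
 * A hedged userspace sketch for inspecting the current mode:
 */
#include <stdio.h>

static int syncookies_mode(void)
{
	FILE *f = fopen("/proc/sys/net/ipv4/tcp_syncookies", "r");
	int mode = -1;

	if (f) {
		if (fscanf(f, "%d", &mode) != 1)
			mode = -1;
		fclose(f);
	}
	return mode;
}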
1511
1512 /* The socket must have its spinlock held when we get
1513  * here, unless it is a TCP_LISTEN socket.
1514  *
1515  * We have a potential double-lock case here, so even when
1516  * doing backlog processing we use the BH locking scheme.
1517  * This is because we cannot sleep with the original spinlock
1518  * held.
1519  */
1520 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1521 {
1522         struct sock *rsk;
1523
1524         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1525                 struct dst_entry *dst = sk->sk_rx_dst;
1526
1527                 sock_rps_save_rxhash(sk, skb);
1528                 sk_mark_napi_id(sk, skb);
1529                 if (dst) {
1530                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1531                             !dst->ops->check(dst, 0)) {
1532                                 dst_release(dst);
1533                                 sk->sk_rx_dst = NULL;
1534                         }
1535                 }
1536                 tcp_rcv_established(sk, skb);
1537                 return 0;
1538         }
1539
1540         if (tcp_checksum_complete(skb))
1541                 goto csum_err;
1542
1543         if (sk->sk_state == TCP_LISTEN) {
1544                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1545
1546                 if (!nsk)
1547                         goto discard;
1548                 if (nsk != sk) {
1549                         if (tcp_child_process(sk, nsk, skb)) {
1550                                 rsk = nsk;
1551                                 goto reset;
1552                         }
1553                         return 0;
1554                 }
1555         } else
1556                 sock_rps_save_rxhash(sk, skb);
1557
1558         if (tcp_rcv_state_process(sk, skb)) {
1559                 rsk = sk;
1560                 goto reset;
1561         }
1562         return 0;
1563
1564 reset:
1565         tcp_v4_send_reset(rsk, skb);
1566 discard:
1567         kfree_skb(skb);
1568         /* Be careful here. If this function gets more complicated and
1569          * gcc suffers from register pressure on the x86, sk (in %ebx)
1570          * might be destroyed here. This current version compiles correctly,
1571          * but you have been warned.
1572          */
1573         return 0;
1574
1575 csum_err:
1576         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1577         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1578         goto discard;
1579 }
1580 EXPORT_SYMBOL(tcp_v4_do_rcv);
1581
1582 int tcp_v4_early_demux(struct sk_buff *skb)
1583 {
1584         const struct iphdr *iph;
1585         const struct tcphdr *th;
1586         struct sock *sk;
1587
1588         if (skb->pkt_type != PACKET_HOST)
1589                 return 0;
1590
1591         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1592                 return 0;
1593
1594         iph = ip_hdr(skb);
1595         th = tcp_hdr(skb);
1596
1597         if (th->doff < sizeof(struct tcphdr) / 4)
1598                 return 0;
1599
1600         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1601                                        iph->saddr, th->source,
1602                                        iph->daddr, ntohs(th->dest),
1603                                        skb->skb_iif, inet_sdif(skb));
1604         if (sk) {
1605                 skb->sk = sk;
1606                 skb->destructor = sock_edemux;
1607                 if (sk_fullsock(sk)) {
1608                         struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1609
1610                         if (dst)
1611                                 dst = dst_check(dst, 0);
1612                         if (dst &&
1613                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1614                                 skb_dst_set_noref(skb, dst);
1615                 }
1616         }
1617         return 0;
1618 }
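/* For illustration - early demux is an optimization and can be turned
 * off when the extra lookup is unwanted (e.g. heavy forwarding
 * workloads). A hedged sketch, assuming the net.ipv4.tcp_early_demux
 * sysctl is exposed, as it is on kernels of this vintage:
 */
#include <fcntl.h>
#include <unistd.h>

static int set_tcp_early_demux(int on)
{
	int fd = open("/proc/sys/net/ipv4/tcp_early_demux", O_WRONLY);
	int ret = -1;

	if (fd >= 0) {
		ret = write(fd, on ? "1\n" : "0\n", 2) == 2 ? 0 : -1;
		close(fd);
	}
	return ret;
}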
1619
1620 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1621 {
1622         u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1623         struct skb_shared_info *shinfo;
1624         const struct tcphdr *th;
1625         struct tcphdr *thtail;
1626         struct sk_buff *tail;
1627         unsigned int hdrlen;
1628         bool fragstolen;
1629         u32 gso_segs;
1630         int delta;
1631
1632         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1633          * we can fix skb->truesize to its real value to avoid future drops.
1634          * This is valid because skb is not yet charged to the socket.
1635          * It has been noticed that pure SACK packets were sometimes dropped
1636          * (if cooked by drivers without the copybreak feature).
1637          */
1638         skb_condense(skb);
1639
1640         skb_dst_drop(skb);
1641
1642         if (unlikely(tcp_checksum_complete(skb))) {
1643                 bh_unlock_sock(sk);
1644                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1645                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1646                 return true;
1647         }
1648
1649         /* Attempt coalescing to the last skb in the backlog, even if we are
1650          * above the limits.
1651          * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1652          */
1653         th = (const struct tcphdr *)skb->data;
1654         hdrlen = th->doff * 4;
1655         shinfo = skb_shinfo(skb);
1656
1657         if (!shinfo->gso_size)
1658                 shinfo->gso_size = skb->len - hdrlen;
1659
1660         if (!shinfo->gso_segs)
1661                 shinfo->gso_segs = 1;
1662
1663         tail = sk->sk_backlog.tail;
1664         if (!tail)
1665                 goto no_coalesce;
1666         thtail = (struct tcphdr *)tail->data;
1667
1668         if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1669             TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1670             ((TCP_SKB_CB(tail)->tcp_flags |
1671               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1672             !((TCP_SKB_CB(tail)->tcp_flags &
1673               TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1674             ((TCP_SKB_CB(tail)->tcp_flags ^
1675               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1676 #ifdef CONFIG_TLS_DEVICE
1677             tail->decrypted != skb->decrypted ||
1678 #endif
1679             thtail->doff != th->doff ||
1680             memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1681                 goto no_coalesce;
1682
1683         __skb_pull(skb, hdrlen);
1684         if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1685                 thtail->window = th->window;
1686
1687                 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1688
1689                 if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))
1690                         TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1691
1692                 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1693                  * thtail->fin, so that the fast path in tcp_rcv_established()
1694                  * is not entered if we append a packet with a FIN.
1695                  * SYN, RST, URG are not present.
1696                  * ACK is set on both packets.
1697                  * PSH: we do not really care in the TCP stack,
1698                  *       at least for 'GRO' packets.
1699                  */
1700                 thtail->fin |= th->fin;
1701                 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1702
1703                 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1704                         TCP_SKB_CB(tail)->has_rxtstamp = true;
1705                         tail->tstamp = skb->tstamp;
1706                         skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1707                 }
1708
1709                 /* Not as strict as GRO. We only need to carry the max mss value */
1710                 skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
1711                                                  skb_shinfo(tail)->gso_size);
1712
1713                 gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
1714                 skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
1715
1716                 sk->sk_backlog.len += delta;
1717                 __NET_INC_STATS(sock_net(sk),
1718                                 LINUX_MIB_TCPBACKLOGCOALESCE);
1719                 kfree_skb_partial(skb, fragstolen);
1720                 return false;
1721         }
1722         __skb_push(skb, hdrlen);
1723
1724 no_coalesce:
1725         /* Only the socket owner can try to collapse/prune rx queues
1726          * to reduce memory overhead, so add a little headroom here.
1727          * Only a few socket backlogs are likely to be non-empty at once.
1728          */
1729         limit += 64*1024;
1730
1731         if (unlikely(sk_add_backlog(sk, skb, limit))) {
1732                 bh_unlock_sock(sk);
1733                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1734                 return true;
1735         }
1736         return false;
1737 }
1738 EXPORT_SYMBOL(tcp_add_backlog);
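/* For illustration - the limit above is sk_rcvbuf + sk_sndbuf (plus the
 * 64KB headroom added under no_coalesce), so the socket buffer sizes
 * bound how much may queue while a user thread owns the socket. A
 * minimal userspace sketch reading the two components for a given fd:
 */
#include <stdio.h>
#include <sys/socket.h>

static void print_backlog_budget(int fd)
{
	int rcv = 0, snd = 0;
	socklen_t len = sizeof(rcv);

	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcv, &len);
	len = sizeof(snd);
	getsockopt(fd, SOL_SOCKET, SO_SNDBUF, &snd, &len);
	printf("backlog limit ~ %d + %d + 64KB headroom\n", rcv, snd);
}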
1739
1740 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1741 {
1742         struct tcphdr *th = (struct tcphdr *)skb->data;
1743
1744         return sk_filter_trim_cap(sk, skb, th->doff * 4);
1745 }
1746 EXPORT_SYMBOL(tcp_filter);
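/* For illustration - sk_filter_trim_cap() runs whatever socket filter
 * is attached to sk. A hedged userspace sketch attaching a trivial
 * classic-BPF program (one that accepts every packet) with
 * SO_ATTACH_FILTER:
 */
#include <sys/socket.h>
#include <linux/filter.h>

static int attach_accept_all(int fd)
{
	static struct sock_filter code[] = {
		BPF_STMT(BPF_RET | BPF_K, 0xffffffff),	/* accept all */
	};
	struct sock_fprog prog = {
		.len = sizeof(code) / sizeof(code[0]),
		.filter = code,
	};

	return setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER,
			  &prog, sizeof(prog));
}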
1747
1748 static void tcp_v4_restore_cb(struct sk_buff *skb)
1749 {
1750         memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1751                 sizeof(struct inet_skb_parm));
1752 }
1753
1754 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1755                            const struct tcphdr *th)
1756 {
1757         /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1758          * barrier() makes sure the compiler won't play aliasing games.
1759          */
1760         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1761                 sizeof(struct inet_skb_parm));
1762         barrier();
1763
1764         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1765         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1766                                     skb->len - th->doff * 4);
1767         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1768         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1769         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1770         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1771         TCP_SKB_CB(skb)->sacked  = 0;
1772         TCP_SKB_CB(skb)->has_rxtstamp =
1773                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1774 }
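/* For illustration - SYN and FIN each occupy one unit of sequence
 * space, which is why end_seq above adds th->syn and th->fin to the
 * payload length. A standalone restatement of that arithmetic:
 */
#include <stdint.h>

static uint32_t tcp_end_seq(uint32_t seq, int syn, int fin,
			    uint32_t payload_len)
{
	/* e.g. a bare SYN: end_seq = seq + 1 + 0 + 0 */
	return seq + syn + fin + payload_len;
}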
1775
1776 /*
1777  *      From tcp_input.c
1778  */
1779
1780 int tcp_v4_rcv(struct sk_buff *skb)
1781 {
1782         struct net *net = dev_net(skb->dev);
1783         struct sk_buff *skb_to_free;
1784         int sdif = inet_sdif(skb);
1785         const struct iphdr *iph;
1786         const struct tcphdr *th;
1787         bool refcounted;
1788         struct sock *sk;
1789         int ret;
1790
1791         if (skb->pkt_type != PACKET_HOST)
1792                 goto discard_it;
1793
1794         /* Count it even if it's bad */
1795         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1796
1797         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1798                 goto discard_it;
1799
1800         th = (const struct tcphdr *)skb->data;
1801
1802         if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1803                 goto bad_packet;
1804         if (!pskb_may_pull(skb, th->doff * 4))
1805                 goto discard_it;
1806
1807         /* An explanation is required here, I think.
1808          * Packet length and doff are validated by header prediction,
1809          * provided the case of th->doff == 0 was eliminated just above.
1810          * So, we defer the remaining checks. */
1811
1812         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1813                 goto csum_error;
1814
1815         th = (const struct tcphdr *)skb->data;
1816         iph = ip_hdr(skb);
1817 lookup:
1818         sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1819                                th->dest, sdif, &refcounted);
1820         if (!sk)
1821                 goto no_tcp_socket;
1822
1823 process:
1824         if (sk->sk_state == TCP_TIME_WAIT)
1825                 goto do_time_wait;
1826
1827         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1828                 struct request_sock *req = inet_reqsk(sk);
1829                 bool req_stolen = false;
1830                 struct sock *nsk;
1831
1832                 sk = req->rsk_listener;
1833                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1834                         sk_drops_add(sk, skb);
1835                         reqsk_put(req);
1836                         goto discard_it;
1837                 }
1838                 if (tcp_checksum_complete(skb)) {
1839                         reqsk_put(req);
1840                         goto csum_error;
1841                 }
1842                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1843                         inet_csk_reqsk_queue_drop_and_put(sk, req);
1844                         goto lookup;
1845                 }
1846                 /* We own a reference on the listener, increase it again
1847                  * as we might lose it too soon.
1848                  */
1849                 sock_hold(sk);
1850                 refcounted = true;
1851                 nsk = NULL;
1852                 if (!tcp_filter(sk, skb)) {
1853                         th = (const struct tcphdr *)skb->data;
1854                         iph = ip_hdr(skb);
1855                         tcp_v4_fill_cb(skb, iph, th);
1856                         nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1857                 }
1858                 if (!nsk) {
1859                         reqsk_put(req);
1860                         if (req_stolen) {
1861                                 /* Another cpu got exclusive access to req
1862                          * and created a full-blown socket.
1863                                  * Try to feed this packet to this socket
1864                                  * instead of discarding it.
1865                                  */
1866                                 tcp_v4_restore_cb(skb);
1867                                 sock_put(sk);
1868                                 goto lookup;
1869                         }
1870                         goto discard_and_relse;
1871                 }
1872                 if (nsk == sk) {
1873                         reqsk_put(req);
1874                         tcp_v4_restore_cb(skb);
1875                 } else if (tcp_child_process(sk, nsk, skb)) {
1876                         tcp_v4_send_reset(nsk, skb);
1877                         goto discard_and_relse;
1878                 } else {
1879                         sock_put(sk);
1880                         return 0;
1881                 }
1882         }
1883         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1884                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1885                 goto discard_and_relse;
1886         }
1887
1888         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1889                 goto discard_and_relse;
1890
1891         if (tcp_v4_inbound_md5_hash(sk, skb))
1892                 goto discard_and_relse;
1893
1894         nf_reset(skb);
1895
1896         if (tcp_filter(sk, skb))
1897                 goto discard_and_relse;
1898         th = (const struct tcphdr *)skb->data;
1899         iph = ip_hdr(skb);
1900         tcp_v4_fill_cb(skb, iph, th);
1901
1902         skb->dev = NULL;
1903
1904         if (sk->sk_state == TCP_LISTEN) {
1905                 ret = tcp_v4_do_rcv(sk, skb);
1906                 goto put_and_return;
1907         }
1908
1909         sk_incoming_cpu_update(sk);
1910
1911         bh_lock_sock_nested(sk);
1912         tcp_segs_in(tcp_sk(sk), skb);
1913         ret = 0;
1914         if (!sock_owned_by_user(sk)) {
1915                 skb_to_free = sk->sk_rx_skb_cache;
1916                 sk->sk_rx_skb_cache = NULL;
1917                 ret = tcp_v4_do_rcv(sk, skb);
1918         } else {
1919                 if (tcp_add_backlog(sk, skb))
1920                         goto discard_and_relse;
1921                 skb_to_free = NULL;
1922         }
1923         bh_unlock_sock(sk);
1924         if (skb_to_free)
1925                 __kfree_skb(skb_to_free);
1926
1927 put_and_return:
1928         if (refcounted)
1929                 sock_put(sk);
1930
1931         return ret;
1932
1933 no_tcp_socket:
1934         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1935                 goto discard_it;
1936
1937         tcp_v4_fill_cb(skb, iph, th);
1938
1939         if (tcp_checksum_complete(skb)) {
1940 csum_error:
1941                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1942 bad_packet:
1943                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
1944         } else {
1945                 tcp_v4_send_reset(NULL, skb);
1946         }
1947
1948 discard_it:
1949         /* Discard frame. */
1950         kfree_skb(skb);
1951         return 0;
1952
1953 discard_and_relse:
1954         sk_drops_add(sk, skb);
1955         if (refcounted)
1956                 sock_put(sk);
1957         goto discard_it;
1958
1959 do_time_wait:
1960         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1961                 inet_twsk_put(inet_twsk(sk));
1962                 goto discard_it;
1963         }
1964
1965         tcp_v4_fill_cb(skb, iph, th);
1966
1967         if (tcp_checksum_complete(skb)) {
1968                 inet_twsk_put(inet_twsk(sk));
1969                 goto csum_error;
1970         }
1971         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1972         case TCP_TW_SYN: {
1973                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1974                                                         &tcp_hashinfo, skb,
1975                                                         __tcp_hdrlen(th),
1976                                                         iph->saddr, th->source,
1977                                                         iph->daddr, th->dest,
1978                                                         inet_iif(skb),
1979                                                         sdif);
1980                 if (sk2) {
1981                         inet_twsk_deschedule_put(inet_twsk(sk));
1982                         sk = sk2;
1983                         tcp_v4_restore_cb(skb);
1984                         refcounted = false;
1985                         goto process;
1986                 }
1987         }
1988                 /* to ACK */
1989                 /* fall through */
1990         case TCP_TW_ACK:
1991                 tcp_v4_timewait_ack(sk, skb);
1992                 break;
1993         case TCP_TW_RST:
1994                 tcp_v4_send_reset(sk, skb);
1995                 inet_twsk_deschedule_put(inet_twsk(sk));
1996                 goto discard_it;
1997         case TCP_TW_SUCCESS:;
1998         }
1999         goto discard_it;
2000 }
2001
2002 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2003         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2004         .twsk_unique    = tcp_twsk_unique,
2005         .twsk_destructor= tcp_twsk_destructor,
2006 };
2007
2008 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2009 {
2010         struct dst_entry *dst = skb_dst(skb);
2011
2012         if (dst && dst_hold_safe(dst)) {
2013                 sk->sk_rx_dst = dst;
2014                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2015         }
2016 }
2017 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2018
2019 const struct inet_connection_sock_af_ops ipv4_specific = {
2020         .queue_xmit        = ip_queue_xmit,
2021         .send_check        = tcp_v4_send_check,
2022         .rebuild_header    = inet_sk_rebuild_header,
2023         .sk_rx_dst_set     = inet_sk_rx_dst_set,
2024         .conn_request      = tcp_v4_conn_request,
2025         .syn_recv_sock     = tcp_v4_syn_recv_sock,
2026         .net_header_len    = sizeof(struct iphdr),
2027         .setsockopt        = ip_setsockopt,
2028         .getsockopt        = ip_getsockopt,
2029         .addr2sockaddr     = inet_csk_addr2sockaddr,
2030         .sockaddr_len      = sizeof(struct sockaddr_in),
2031 #ifdef CONFIG_COMPAT
2032         .compat_setsockopt = compat_ip_setsockopt,
2033         .compat_getsockopt = compat_ip_getsockopt,
2034 #endif
2035         .mtu_reduced       = tcp_v4_mtu_reduced,
2036 };
2037 EXPORT_SYMBOL(ipv4_specific);
2038
2039 #ifdef CONFIG_TCP_MD5SIG
2040 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2041         .md5_lookup             = tcp_v4_md5_lookup,
2042         .calc_md5_hash          = tcp_v4_md5_hash_skb,
2043         .md5_parse              = tcp_v4_parse_md5_keys,
2044 };
2045 #endif
2046
2047 /* NOTE: A lot of fields are set to zero explicitly by the call to
2048  *       sk_alloc(), so they need not be initialised here.
2049  */
2050 static int tcp_v4_init_sock(struct sock *sk)
2051 {
2052         struct inet_connection_sock *icsk = inet_csk(sk);
2053
2054         tcp_init_sock(sk);
2055
2056         icsk->icsk_af_ops = &ipv4_specific;
2057
2058 #ifdef CONFIG_TCP_MD5SIG
2059         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2060 #endif
2061
2062         return 0;
2063 }
2064
2065 void tcp_v4_destroy_sock(struct sock *sk)
2066 {
2067         struct tcp_sock *tp = tcp_sk(sk);
2068
2069         trace_tcp_destroy_sock(sk);
2070
2071         tcp_clear_xmit_timers(sk);
2072
2073         tcp_cleanup_congestion_control(sk);
2074
2075         tcp_cleanup_ulp(sk);
2076
2077         /* Clean up the write buffer. */
2078         tcp_write_queue_purge(sk);
2079
2080         /* Check if we want to disable active TFO */
2081         tcp_fastopen_active_disable_ofo_check(sk);
2082
2083         /* Cleans up our, hopefully empty, out_of_order_queue. */
2084         skb_rbtree_purge(&tp->out_of_order_queue);
2085
2086 #ifdef CONFIG_TCP_MD5SIG
2087         /* Clean up the MD5 key list, if any */
2088         if (tp->md5sig_info) {
2089                 tcp_clear_md5_list(sk);
2090                 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2091                 tp->md5sig_info = NULL;
2092         }
2093 #endif
2094
2095         /* Clean up a referenced TCP bind bucket. */
2096         if (inet_csk(sk)->icsk_bind_hash)
2097                 inet_put_port(sk);
2098
2099         BUG_ON(tp->fastopen_rsk);
2100
2101         /* If socket is aborted during connect operation */
2102         tcp_free_fastopen_req(tp);
2103         tcp_fastopen_destroy_cipher(sk);
2104         tcp_saved_syn_free(tp);
2105
2106         sk_sockets_allocated_dec(sk);
2107 }
2108 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2109
2110 #ifdef CONFIG_PROC_FS
2111 /* Proc filesystem TCP sock list dumping. */
2112
2113 /*
2114  * Get the next listener socket after cur.  If cur is NULL, get the first socket
2115  * starting from bucket given in st->bucket; when st->bucket is zero the
2116  * very first socket in the hash table is returned.
2117  */
2118 static void *listening_get_next(struct seq_file *seq, void *cur)
2119 {
2120         struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2121         struct tcp_iter_state *st = seq->private;
2122         struct net *net = seq_file_net(seq);
2123         struct inet_listen_hashbucket *ilb;
2124         struct sock *sk = cur;
2125
2126         if (!sk) {
2127 get_head:
2128                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2129                 spin_lock(&ilb->lock);
2130                 sk = sk_head(&ilb->head);
2131                 st->offset = 0;
2132                 goto get_sk;
2133         }
2134         ilb = &tcp_hashinfo.listening_hash[st->bucket];
2135         ++st->num;
2136         ++st->offset;
2137
2138         sk = sk_next(sk);
2139 get_sk:
2140         sk_for_each_from(sk) {
2141                 if (!net_eq(sock_net(sk), net))
2142                         continue;
2143                 if (sk->sk_family == afinfo->family)
2144                         return sk;
2145         }
2146         spin_unlock(&ilb->lock);
2147         st->offset = 0;
2148         if (++st->bucket < INET_LHTABLE_SIZE)
2149                 goto get_head;
2150         return NULL;
2151 }
2152
2153 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2154 {
2155         struct tcp_iter_state *st = seq->private;
2156         void *rc;
2157
2158         st->bucket = 0;
2159         st->offset = 0;
2160         rc = listening_get_next(seq, NULL);
2161
2162         while (rc && *pos) {
2163                 rc = listening_get_next(seq, rc);
2164                 --*pos;
2165         }
2166         return rc;
2167 }
2168
2169 static inline bool empty_bucket(const struct tcp_iter_state *st)
2170 {
2171         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2172 }
2173
2174 /*
2175  * Get first established socket starting from bucket given in st->bucket.
2176  * If st->bucket is zero, the very first socket in the hash is returned.
2177  */
2178 static void *established_get_first(struct seq_file *seq)
2179 {
2180         struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2181         struct tcp_iter_state *st = seq->private;
2182         struct net *net = seq_file_net(seq);
2183         void *rc = NULL;
2184
2185         st->offset = 0;
2186         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2187                 struct sock *sk;
2188                 struct hlist_nulls_node *node;
2189                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2190
2191                 /* Lockless fast path for the common case of empty buckets */
2192                 if (empty_bucket(st))
2193                         continue;
2194
2195                 spin_lock_bh(lock);
2196                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2197                         if (sk->sk_family != afinfo->family ||
2198                             !net_eq(sock_net(sk), net)) {
2199                                 continue;
2200                         }
2201                         rc = sk;
2202                         goto out;
2203                 }
2204                 spin_unlock_bh(lock);
2205         }
2206 out:
2207         return rc;
2208 }
2209
2210 static void *established_get_next(struct seq_file *seq, void *cur)
2211 {
2212         struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2213         struct sock *sk = cur;
2214         struct hlist_nulls_node *node;
2215         struct tcp_iter_state *st = seq->private;
2216         struct net *net = seq_file_net(seq);
2217
2218         ++st->num;
2219         ++st->offset;
2220
2221         sk = sk_nulls_next(sk);
2222
2223         sk_nulls_for_each_from(sk, node) {
2224                 if (sk->sk_family == afinfo->family &&
2225                     net_eq(sock_net(sk), net))
2226                         return sk;
2227         }
2228
2229         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2230         ++st->bucket;
2231         return established_get_first(seq);
2232 }
2233
2234 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2235 {
2236         struct tcp_iter_state *st = seq->private;
2237         void *rc;
2238
2239         st->bucket = 0;
2240         rc = established_get_first(seq);
2241
2242         while (rc && pos) {
2243                 rc = established_get_next(seq, rc);
2244                 --pos;
2245         }
2246         return rc;
2247 }
2248
2249 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2250 {
2251         void *rc;
2252         struct tcp_iter_state *st = seq->private;
2253
2254         st->state = TCP_SEQ_STATE_LISTENING;
2255         rc        = listening_get_idx(seq, &pos);
2256
2257         if (!rc) {
2258                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2259                 rc        = established_get_idx(seq, pos);
2260         }
2261
2262         return rc;
2263 }
2264
2265 static void *tcp_seek_last_pos(struct seq_file *seq)
2266 {
2267         struct tcp_iter_state *st = seq->private;
2268         int offset = st->offset;
2269         int orig_num = st->num;
2270         void *rc = NULL;
2271
2272         switch (st->state) {
2273         case TCP_SEQ_STATE_LISTENING:
2274                 if (st->bucket >= INET_LHTABLE_SIZE)
2275                         break;
2276                 st->state = TCP_SEQ_STATE_LISTENING;
2277                 rc = listening_get_next(seq, NULL);
2278                 while (offset-- && rc)
2279                         rc = listening_get_next(seq, rc);
2280                 if (rc)
2281                         break;
2282                 st->bucket = 0;
2283                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2284                 /* Fallthrough */
2285         case TCP_SEQ_STATE_ESTABLISHED:
2286                 if (st->bucket > tcp_hashinfo.ehash_mask)
2287                         break;
2288                 rc = established_get_first(seq);
2289                 while (offset-- && rc)
2290                         rc = established_get_next(seq, rc);
2291         }
2292
2293         st->num = orig_num;
2294
2295         return rc;
2296 }
2297
2298 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2299 {
2300         struct tcp_iter_state *st = seq->private;
2301         void *rc;
2302
2303         if (*pos && *pos == st->last_pos) {
2304                 rc = tcp_seek_last_pos(seq);
2305                 if (rc)
2306                         goto out;
2307         }
2308
2309         st->state = TCP_SEQ_STATE_LISTENING;
2310         st->num = 0;
2311         st->bucket = 0;
2312         st->offset = 0;
2313         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2314
2315 out:
2316         st->last_pos = *pos;
2317         return rc;
2318 }
2319 EXPORT_SYMBOL(tcp_seq_start);
2320
2321 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2322 {
2323         struct tcp_iter_state *st = seq->private;
2324         void *rc = NULL;
2325
2326         if (v == SEQ_START_TOKEN) {
2327                 rc = tcp_get_idx(seq, 0);
2328                 goto out;
2329         }
2330
2331         switch (st->state) {
2332         case TCP_SEQ_STATE_LISTENING:
2333                 rc = listening_get_next(seq, v);
2334                 if (!rc) {
2335                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2336                         st->bucket = 0;
2337                         st->offset = 0;
2338                         rc        = established_get_first(seq);
2339                 }
2340                 break;
2341         case TCP_SEQ_STATE_ESTABLISHED:
2342                 rc = established_get_next(seq, v);
2343                 break;
2344         }
2345 out:
2346         ++*pos;
2347         st->last_pos = *pos;
2348         return rc;
2349 }
2350 EXPORT_SYMBOL(tcp_seq_next);
2351
2352 void tcp_seq_stop(struct seq_file *seq, void *v)
2353 {
2354         struct tcp_iter_state *st = seq->private;
2355
2356         switch (st->state) {
2357         case TCP_SEQ_STATE_LISTENING:
2358                 if (v != SEQ_START_TOKEN)
2359                         spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2360                 break;
2361         case TCP_SEQ_STATE_ESTABLISHED:
2362                 if (v)
2363                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2364                 break;
2365         }
2366 }
2367 EXPORT_SYMBOL(tcp_seq_stop);
2368
2369 static void get_openreq4(const struct request_sock *req,
2370                          struct seq_file *f, int i)
2371 {
2372         const struct inet_request_sock *ireq = inet_rsk(req);
2373         long delta = req->rsk_timer.expires - jiffies;
2374
2375         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2376                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2377                 i,
2378                 ireq->ir_loc_addr,
2379                 ireq->ir_num,
2380                 ireq->ir_rmt_addr,
2381                 ntohs(ireq->ir_rmt_port),
2382                 TCP_SYN_RECV,
2383                 0, 0, /* could print option size, but that is af dependent. */
2384                 1,    /* timers active (only the expire timer) */
2385                 jiffies_delta_to_clock_t(delta),
2386                 req->num_timeout,
2387                 from_kuid_munged(seq_user_ns(f),
2388                                  sock_i_uid(req->rsk_listener)),
2389                 0,  /* non standard timer */
2390                 0, /* open_requests have no inode */
2391                 0,
2392                 req);
2393 }
2394
2395 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2396 {
2397         int timer_active;
2398         unsigned long timer_expires;
2399         const struct tcp_sock *tp = tcp_sk(sk);
2400         const struct inet_connection_sock *icsk = inet_csk(sk);
2401         const struct inet_sock *inet = inet_sk(sk);
2402         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2403         __be32 dest = inet->inet_daddr;
2404         __be32 src = inet->inet_rcv_saddr;
2405         __u16 destp = ntohs(inet->inet_dport);
2406         __u16 srcp = ntohs(inet->inet_sport);
2407         int rx_queue;
2408         int state;
2409
2410         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2411             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2412             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2413                 timer_active    = 1;
2414                 timer_expires   = icsk->icsk_timeout;
2415         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2416                 timer_active    = 4;
2417                 timer_expires   = icsk->icsk_timeout;
2418         } else if (timer_pending(&sk->sk_timer)) {
2419                 timer_active    = 2;
2420                 timer_expires   = sk->sk_timer.expires;
2421         } else {
2422                 timer_active    = 0;
2423                 timer_expires = jiffies;
2424         }
2425
2426         state = inet_sk_state_load(sk);
2427         if (state == TCP_LISTEN)
2428                 rx_queue = sk->sk_ack_backlog;
2429         else
2430                 /* Because we don't lock the socket,
2431                  * we might find a transient negative value.
2432                  */
2433                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2434
2435         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2436                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2437                 i, src, srcp, dest, destp, state,
2438                 tp->write_seq - tp->snd_una,
2439                 rx_queue,
2440                 timer_active,
2441                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2442                 icsk->icsk_retransmits,
2443                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2444                 icsk->icsk_probes_out,
2445                 sock_i_ino(sk),
2446                 refcount_read(&sk->sk_refcnt), sk,
2447                 jiffies_to_clock_t(icsk->icsk_rto),
2448                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2449                 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2450                 tp->snd_cwnd,
2451                 state == TCP_LISTEN ?
2452                     fastopenq->max_qlen :
2453                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2454 }
2455
2456 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2457                                struct seq_file *f, int i)
2458 {
2459         long delta = tw->tw_timer.expires - jiffies;
2460         __be32 dest, src;
2461         __u16 destp, srcp;
2462
2463         dest  = tw->tw_daddr;
2464         src   = tw->tw_rcv_saddr;
2465         destp = ntohs(tw->tw_dport);
2466         srcp  = ntohs(tw->tw_sport);
2467
2468         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2469                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2470                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2471                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2472                 refcount_read(&tw->tw_refcnt), tw);
2473 }
2474
2475 #define TMPSZ 150
2476
2477 static int tcp4_seq_show(struct seq_file *seq, void *v)
2478 {
2479         struct tcp_iter_state *st;
2480         struct sock *sk = v;
2481
2482         seq_setwidth(seq, TMPSZ - 1);
2483         if (v == SEQ_START_TOKEN) {
2484                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2485                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2486                            "inode");
2487                 goto out;
2488         }
2489         st = seq->private;
2490
2491         if (sk->sk_state == TCP_TIME_WAIT)
2492                 get_timewait4_sock(v, seq, st->num);
2493         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2494                 get_openreq4(v, seq, st->num);
2495         else
2496                 get_tcp4_sock(v, seq, st->num);
2497 out:
2498         seq_pad(seq, '\n');
2499         return 0;
2500 }
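/* For illustration - a hedged userspace sketch parsing the first
 * columns emitted by tcp4_seq_show()/get_tcp4_sock() above. The %08X
 * address fields are the raw __be32 values, so on a little-endian host
 * 0100007F reads back as 127.0.0.1.
 */
#include <stdio.h>

static void dump_tcp4(void)
{
	char line[256];
	FILE *f = fopen("/proc/net/tcp", "r");

	if (!f)
		return;
	fgets(line, sizeof(line), f);		/* skip the header row */
	while (fgets(line, sizeof(line), f)) {
		unsigned int laddr, lport, raddr, rport, state;

		if (sscanf(line, "%*d: %x:%x %x:%x %x",
			   &laddr, &lport, &raddr, &rport, &state) == 5)
			printf("%08X:%u -> %08X:%u st %02X\n",
			       laddr, lport, raddr, rport, state);
	}
	fclose(f);
}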
2501
2502 static const struct seq_operations tcp4_seq_ops = {
2503         .show           = tcp4_seq_show,
2504         .start          = tcp_seq_start,
2505         .next           = tcp_seq_next,
2506         .stop           = tcp_seq_stop,
2507 };
2508
2509 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2510         .family         = AF_INET,
2511 };
2512
2513 static int __net_init tcp4_proc_init_net(struct net *net)
2514 {
2515         if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2516                         sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2517                 return -ENOMEM;
2518         return 0;
2519 }
2520
2521 static void __net_exit tcp4_proc_exit_net(struct net *net)
2522 {
2523         remove_proc_entry("tcp", net->proc_net);
2524 }
2525
2526 static struct pernet_operations tcp4_net_ops = {
2527         .init = tcp4_proc_init_net,
2528         .exit = tcp4_proc_exit_net,
2529 };
2530
2531 int __init tcp4_proc_init(void)
2532 {
2533         return register_pernet_subsys(&tcp4_net_ops);
2534 }
2535
2536 void tcp4_proc_exit(void)
2537 {
2538         unregister_pernet_subsys(&tcp4_net_ops);
2539 }
2540 #endif /* CONFIG_PROC_FS */
2541
2542 struct proto tcp_prot = {
2543         .name                   = "TCP",
2544         .owner                  = THIS_MODULE,
2545         .close                  = tcp_close,
2546         .pre_connect            = tcp_v4_pre_connect,
2547         .connect                = tcp_v4_connect,
2548         .disconnect             = tcp_disconnect,
2549         .accept                 = inet_csk_accept,
2550         .ioctl                  = tcp_ioctl,
2551         .init                   = tcp_v4_init_sock,
2552         .destroy                = tcp_v4_destroy_sock,
2553         .shutdown               = tcp_shutdown,
2554         .setsockopt             = tcp_setsockopt,
2555         .getsockopt             = tcp_getsockopt,
2556         .keepalive              = tcp_set_keepalive,
2557         .recvmsg                = tcp_recvmsg,
2558         .sendmsg                = tcp_sendmsg,
2559         .sendpage               = tcp_sendpage,
2560         .backlog_rcv            = tcp_v4_do_rcv,
2561         .release_cb             = tcp_release_cb,
2562         .hash                   = inet_hash,
2563         .unhash                 = inet_unhash,
2564         .get_port               = inet_csk_get_port,
2565         .enter_memory_pressure  = tcp_enter_memory_pressure,
2566         .leave_memory_pressure  = tcp_leave_memory_pressure,
2567         .stream_memory_free     = tcp_stream_memory_free,
2568         .sockets_allocated      = &tcp_sockets_allocated,
2569         .orphan_count           = &tcp_orphan_count,
2570         .memory_allocated       = &tcp_memory_allocated,
2571         .memory_pressure        = &tcp_memory_pressure,
2572         .sysctl_mem             = sysctl_tcp_mem,
2573         .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2574         .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2575         .max_header             = MAX_TCP_HEADER,
2576         .obj_size               = sizeof(struct tcp_sock),
2577         .slab_flags             = SLAB_TYPESAFE_BY_RCU,
2578         .twsk_prot              = &tcp_timewait_sock_ops,
2579         .rsk_prot               = &tcp_request_sock_ops,
2580         .h.hashinfo             = &tcp_hashinfo,
2581         .no_autobind            = true,
2582 #ifdef CONFIG_COMPAT
2583         .compat_setsockopt      = compat_tcp_setsockopt,
2584         .compat_getsockopt      = compat_tcp_getsockopt,
2585 #endif
2586         .diag_destroy           = tcp_abort,
2587 };
2588 EXPORT_SYMBOL(tcp_prot);
2589
2590 static void __net_exit tcp_sk_exit(struct net *net)
2591 {
2592         int cpu;
2593
2594         if (net->ipv4.tcp_congestion_control)
2595                 module_put(net->ipv4.tcp_congestion_control->owner);
2596
2597         for_each_possible_cpu(cpu)
2598                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2599         free_percpu(net->ipv4.tcp_sk);
2600 }
2601
2602 static int __net_init tcp_sk_init(struct net *net)
2603 {
2604         int res, cpu, cnt;
2605
2606         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2607         if (!net->ipv4.tcp_sk)
2608                 return -ENOMEM;
2609
2610         for_each_possible_cpu(cpu) {
2611                 struct sock *sk;
2612
2613                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2614                                            IPPROTO_TCP, net);
2615                 if (res)
2616                         goto fail;
2617                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2618
2619                 /* Please enforce IP_DF and IPID==0 for RST and
2620                  * ACK segments sent in SYN-RECV and TIME-WAIT state.
2621                  */
2622                 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2623
2624                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2625         }
2626
2627         net->ipv4.sysctl_tcp_ecn = 2;
2628         net->ipv4.sysctl_tcp_ecn_fallback = 1;
2629
2630         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2631         net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2632         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2633         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2634
2635         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2636         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2637         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2638
2639         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2640         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2641         net->ipv4.sysctl_tcp_syncookies = 1;
2642         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2643         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2644         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2645         net->ipv4.sysctl_tcp_orphan_retries = 0;
2646         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2647         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2648         net->ipv4.sysctl_tcp_tw_reuse = 2;
2649
2650         cnt = tcp_hashinfo.ehash_mask + 1;
2651         net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2652         net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2653
2654         net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2655         net->ipv4.sysctl_tcp_sack = 1;
2656         net->ipv4.sysctl_tcp_window_scaling = 1;
2657         net->ipv4.sysctl_tcp_timestamps = 1;
2658         net->ipv4.sysctl_tcp_early_retrans = 3;
2659         net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2660         net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2661         net->ipv4.sysctl_tcp_retrans_collapse = 1;
2662         net->ipv4.sysctl_tcp_max_reordering = 300;
2663         net->ipv4.sysctl_tcp_dsack = 1;
2664         net->ipv4.sysctl_tcp_app_win = 31;
2665         net->ipv4.sysctl_tcp_adv_win_scale = 1;
2666         net->ipv4.sysctl_tcp_frto = 2;
2667         net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2668         /* This limits the percentage of the congestion window which we
2669          * will allow a single TSO frame to consume.  Building TSO frames
2670          * which are too large can cause TCP streams to be bursty.
2671          */
2672         net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2673         /* Default TSQ limit of 16 TSO segments */
2674         net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
2675         /* rfc5961 challenge ack rate limiting */
2676         net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2677         net->ipv4.sysctl_tcp_min_tso_segs = 2;
2678         net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2679         net->ipv4.sysctl_tcp_autocorking = 1;
2680         net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2681         net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2682         net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2683         if (net != &init_net) {
2684                 memcpy(net->ipv4.sysctl_tcp_rmem,
2685                        init_net.ipv4.sysctl_tcp_rmem,
2686                        sizeof(init_net.ipv4.sysctl_tcp_rmem));
2687                 memcpy(net->ipv4.sysctl_tcp_wmem,
2688                        init_net.ipv4.sysctl_tcp_wmem,
2689                        sizeof(init_net.ipv4.sysctl_tcp_wmem));
2690         }
2691         net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2692         net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2693         net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2694         spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2695         net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2696         atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2697
2698         /* Reno is always built in */
2699         if (!net_eq(net, &init_net) &&
2700             try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2701                 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2702         else
2703                 net->ipv4.tcp_congestion_control = &tcp_reno;
2704
2705         return 0;
2706 fail:
2707         tcp_sk_exit(net);
2708
2709         return res;
2710 }
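/* For illustration - every default initialised in tcp_sk_init() above
 * is a per-namespace sysctl under /proc/sys/net/ipv4/. A minimal
 * sketch for reading one back; "tcp_fin_timeout" (default 60 seconds,
 * from TCP_FIN_TIMEOUT) is just an example name:
 */
#include <stdio.h>

static long read_ipv4_sysctl(const char *name)
{
	char path[128];
	long val = -1;
	FILE *f;

	snprintf(path, sizeof(path), "/proc/sys/net/ipv4/%s", name);
	f = fopen(path, "r");
	if (f) {
		if (fscanf(f, "%ld", &val) != 1)
			val = -1;
		fclose(f);
	}
	return val;
}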
2711
2712 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2713 {
2714         struct net *net;
2715
2716         inet_twsk_purge(&tcp_hashinfo, AF_INET);
2717
2718         list_for_each_entry(net, net_exit_list, exit_list)
2719                 tcp_fastopen_ctx_destroy(net);
2720 }
2721
2722 static struct pernet_operations __net_initdata tcp_sk_ops = {
2723        .init       = tcp_sk_init,
2724        .exit       = tcp_sk_exit,
2725        .exit_batch = tcp_sk_exit_batch,
2726 };
2727
2728 void __init tcp_v4_init(void)
2729 {
2730         if (register_pernet_subsys(&tcp_sk_ops))
2731                 panic("Failed to create the TCP control socket.\n");
2732 }