net/ipv4/tcp_ipv4.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   4  *              operating system.  INET is implemented using the  BSD Socket
   5  *              interface as the means of communication with the user level.
   6  *
   7  *              Implementation of the Transmission Control Protocol(TCP).
   8  *
   9  *              IPv4 specific functions
  10  *
  11  *              code split from:
  12  *              linux/ipv4/tcp.c
  13  *              linux/ipv4/tcp_input.c
  14  *              linux/ipv4/tcp_output.c
  15  *
  16  *              See tcp.c for author information
  17  */
  18
  19 /*
  20  * Changes:
  21  *              David S. Miller :       New socket lookup architecture.
  22  *                                      This code is dedicated to John Dyson.
  23  *              David S. Miller :       Change semantics of established hash,
  24  *                                      half is devoted to TIME_WAIT sockets
  25  *                                      and the rest go in the other half.
  26  *              Andi Kleen :            Add support for syncookies and fixed
  27  *                                      some bugs: ip options weren't passed to
  28  *                                      the TCP layer, missed a check for an
  29  *                                      ACK bit.
  30  *              Andi Kleen :            Implemented fast path mtu discovery.
  31  *                                      Fixed many serious bugs in the
  32  *                                      request_sock handling and moved
  33  *                                      most of it into the af independent code.
  34  *                                      Added tail drop and some other bugfixes.
  35  *                                      Added new listen semantics.
  36  *              Mike McLagan    :       Routing by source
  37  *      Juan Jose Ciarlante:            ip_dynaddr bits
  38  *              Andi Kleen:             various fixes.
  39  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  40  *                                      coma.
  41  *      Andi Kleen              :       Fix new listen.
  42  *      Andi Kleen              :       Fix accept error reporting.
  43  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  44  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  45  *                                      a single port at the same time.
  46  */
  47
  48 #define pr_fmt(fmt) "TCP: " fmt
  49
  50 #include <linux/bottom_half.h>
  51 #include <linux/types.h>
  52 #include <linux/fcntl.h>
  53 #include <linux/module.h>
  54 #include <linux/random.h>
  55 #include <linux/cache.h>
  56 #include <linux/jhash.h>
  57 #include <linux/init.h>
  58 #include <linux/times.h>
  59 #include <linux/slab.h>
  60
  61 #include <net/net_namespace.h>
  62 #include <net/icmp.h>
  63 #include <net/inet_hashtables.h>
  64 #include <net/tcp.h>
  65 #include <net/transp_v6.h>
  66 #include <net/ipv6.h>
  67 #include <net/inet_common.h>
  68 #include <net/timewait_sock.h>
  69 #include <net/xfrm.h>
  70 #include <net/secure_seq.h>
  71 #include <net/busy_poll.h>
  72
  73 #include <linux/inet.h>
  74 #include <linux/ipv6.h>
  75 #include <linux/stddef.h>
  76 #include <linux/proc_fs.h>
  77 #include <linux/seq_file.h>
  78 #include <linux/inetdevice.h>
  79
  80 #include <crypto/hash.h>
  81 #include <linux/scatterlist.h>
  82
  83 #include <trace/events/tcp.h>
  84
  85 #ifdef CONFIG_TCP_MD5SIG
  86 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
  87                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
  88 #endif
  89
  90 struct inet_hashinfo tcp_hashinfo;
  91 EXPORT_SYMBOL(tcp_hashinfo);
  92
  93 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
  94 {
  95         return secure_tcp_seq(ip_hdr(skb)->daddr,
  96                               ip_hdr(skb)->saddr,
  97                               tcp_hdr(skb)->dest,
  98                               tcp_hdr(skb)->source);
  99 }
 100
 101 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
 102 {
 103         return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
 104 }
 105
 106 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 107 {
 108         const struct inet_timewait_sock *tw = inet_twsk(sktw);
 109         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 110         struct tcp_sock *tp = tcp_sk(sk);
 111         int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
 112
 113         if (reuse == 2) {
 114                 /* Still does not detect *everything* that goes through
 115                  * lo, since we require a loopback src or dst address
 116                  * or direct binding to 'lo' interface.
 117                  */
 118                 bool loopback = false;
 119                 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
 120                         loopback = true;
 121 #if IS_ENABLED(CONFIG_IPV6)
 122                 if (tw->tw_family == AF_INET6) {
 123                         if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
 124                             (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
 125                              (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
 126                             ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
 127                             (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
 128                              (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
 129                                 loopback = true;
 130                 } else
 131 #endif
 132                 {
 133                         if (ipv4_is_loopback(tw->tw_daddr) ||
 134                             ipv4_is_loopback(tw->tw_rcv_saddr))
 135                                 loopback = true;
 136                 }
 137                 if (!loopback)
 138                         reuse = 0;
 139         }
 140
 141         /* With PAWS, it is safe from the viewpoint
 142            of data integrity. Even without PAWS it is safe provided sequence
 143            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 144
 145            Actually, the idea is close to VJ's one, only timestamp cache is
 146            held not per host, but per port pair and TW bucket is used as state
 147            holder.
 148
 149            If TW bucket has been already destroyed we fall back to VJ's scheme
 150            and use initial timestamp retrieved from peer table.
 151          */
 152         if (tcptw->tw_ts_recent_stamp &&
 153             (!twp || (reuse && time_after32(ktime_get_seconds(),
 154                                             tcptw->tw_ts_recent_stamp)))) {
 155                 /* In case of repair and re-using TIME-WAIT sockets we still
 156                  * want to be sure that it is safe as above but honor the
 157                  * sequence numbers and time stamps set as part of the repair
 158                  * process.
 159                  *
 160                  * Without this check re-using a TIME-WAIT socket with TCP
 161                  * repair would accumulate a -1 on the repair assigned
 162                  * sequence number. The first time it is reused the sequence
 163                  * is -1, the second time -2, etc. This fixes that issue
 164                  * without appearing to create any others.
 165                  */
 166                 if (likely(!tp->repair)) {
 167                         tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 168                         if (tp->write_seq == 0)
 169                                 tp->write_seq = 1;
 170                         tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 171                         tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 172                 }
 173                 sock_hold(sktw);
 174                 return 1;
 175         }
 176
 177         return 0;
 178 }
 179 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 180
 181 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
 182                               int addr_len)
 183 {
 184         /* This check is replicated from tcp_v4_connect() and intended to
 185          * prevent BPF program called below from accessing bytes that are out
 186          * of the bound specified by user in addr_len.
 187          */
 188         if (addr_len < sizeof(struct sockaddr_in))
 189                 return -EINVAL;
 190
 191         sock_owned_by_me(sk);
 192
 193         return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
 194 }
 195
 196 /* This will initiate an outgoing connection. */
 197 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 198 {
 199         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 200         struct inet_sock *inet = inet_sk(sk);
 201         struct tcp_sock *tp = tcp_sk(sk);
 202         __be16 orig_sport, orig_dport;
 203         __be32 daddr, nexthop;
 204         struct flowi4 *fl4;
 205         struct rtable *rt;
 206         int err;
 207         struct ip_options_rcu *inet_opt;
 208         struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
 209
 210         if (addr_len < sizeof(struct sockaddr_in))
 211                 return -EINVAL;
 212
 213         if (usin->sin_family != AF_INET)
 214                 return -EAFNOSUPPORT;
 215
 216         nexthop = daddr = usin->sin_addr.s_addr;
 217         inet_opt = rcu_dereference_protected(inet->inet_opt,
 218                                              lockdep_sock_is_held(sk));
 219         if (inet_opt && inet_opt->opt.srr) {
 220                 if (!daddr)
 221                         return -EINVAL;
 222                 nexthop = inet_opt->opt.faddr;
 223         }
 224
 225         orig_sport = inet->inet_sport;
 226         orig_dport = usin->sin_port;
 227         fl4 = &inet->cork.fl.u.ip4;
 228         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
 229                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 230                               IPPROTO_TCP,
 231                               orig_sport, orig_dport, sk);
 232         if (IS_ERR(rt)) {
 233                 err = PTR_ERR(rt);
 234                 if (err == -ENETUNREACH)
 235                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 236                 return err;
 237         }
 238
 239         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 240                 ip_rt_put(rt);
 241                 return -ENETUNREACH;
 242         }
 243
 244         if (!inet_opt || !inet_opt->opt.srr)
 245                 daddr = fl4->daddr;
 246
 247         if (!inet->inet_saddr)
 248                 inet->inet_saddr = fl4->saddr;
 249         sk_rcv_saddr_set(sk, inet->inet_saddr);
 250
 251         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 252                 /* Reset inherited state */
 253                 tp->rx_opt.ts_recent       = 0;
 254                 tp->rx_opt.ts_recent_stamp = 0;
 255                 if (likely(!tp->repair))
 256                         tp->write_seq      = 0;
 257         }
 258
 259         inet->inet_dport = usin->sin_port;
 260         sk_daddr_set(sk, daddr);
 261
 262         inet_csk(sk)->icsk_ext_hdr_len = 0;
 263         if (inet_opt)
 264                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
 265
 266         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 267
 268         /* Socket identity is still unknown (sport may be zero).
 269          * However we set state to SYN-SENT and not releasing socket
 270          * lock select source port, enter ourselves into the hash tables and
 271          * complete initialization after this.
 272          */
 273         tcp_set_state(sk, TCP_SYN_SENT);
 274         err = inet_hash_connect(tcp_death_row, sk);
 275         if (err)
 276                 goto failure;
 277
 278         sk_set_txhash(sk);
 279
 280         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
 281                                inet->inet_sport, inet->inet_dport, sk);
 282         if (IS_ERR(rt)) {
 283                 err = PTR_ERR(rt);
 284                 rt = NULL;
 285                 goto failure;
 286         }
 287         /* OK, now commit destination to socket.  */
 288         sk->sk_gso_type = SKB_GSO_TCPV4;
 289         sk_setup_caps(sk, &rt->dst);
 290         rt = NULL;
 291
 292         if (likely(!tp->repair)) {
 293                 if (!tp->write_seq)
 294                         tp->write_seq = secure_tcp_seq(inet->inet_saddr,
 295                                                        inet->inet_daddr,
 296                                                        inet->inet_sport,
 297                                                        usin->sin_port);
 298                 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
 299                                                  inet->inet_saddr,
 300                                                  inet->inet_daddr);
 301         }
 302
 303         inet->inet_id = tp->write_seq ^ jiffies;
 304
 305         if (tcp_fastopen_defer_connect(sk, &err))
 306                 return err;
 307         if (err)
 308                 goto failure;
 309
 310         err = tcp_connect(sk);
 311
 312         if (err)
 313                 goto failure;
 314
 315         return 0;
 316
 317 failure:
 318         /*
 319          * This unhashes the socket and releases the local port,
 320          * if necessary.
 321          */
 322         tcp_set_state(sk, TCP_CLOSE);
 323         ip_rt_put(rt);
 324         sk->sk_route_caps = 0;
 325         inet->inet_dport = 0;
 326         return err;
 327 }
 328 EXPORT_SYMBOL(tcp_v4_connect);
 329
 330 /*
 331  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 332  * It can be called through tcp_release_cb() if socket was owned by user
 333  * at the time tcp_v4_err() was called to handle ICMP message.
 334  */
 335 void tcp_v4_mtu_reduced(struct sock *sk)
 336 {
 337         struct inet_sock *inet = inet_sk(sk);
 338         struct dst_entry *dst;
 339         u32 mtu;
 340
 341         if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
 342                 return;
 343         mtu = tcp_sk(sk)->mtu_info;
 344         dst = inet_csk_update_pmtu(sk, mtu);
 345         if (!dst)
 346                 return;
 347
 348         /* Something is about to be wrong... Remember soft error
 349          * for the case, if this connection will not able to recover.
 350          */
 351         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 352                 sk->sk_err_soft = EMSGSIZE;
 353
 354         mtu = dst_mtu(dst);
 355
 356         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 357             ip_sk_accept_pmtu(sk) &&
 358             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 359                 tcp_sync_mss(sk, mtu);
 360
 361                 /* Resend the TCP packet because it's
 362                  * clear that the old packet has been
 363                  * dropped. This is the new "fast" path mtu
 364                  * discovery.
 365                  */
 366                 tcp_simple_retransmit(sk);
 367         } /* else let the usual retransmit timer handle it */
 368 }
 369 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
 370
 371 static void do_redirect(struct sk_buff *skb, struct sock *sk)
 372 {
 373         struct dst_entry *dst = __sk_dst_check(sk, 0);
 374
 375         if (dst)
 376                 dst->ops->redirect(dst, sk, skb);
 377 }
 378
 379
 380 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
 381 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
 382 {
 383         struct request_sock *req = inet_reqsk(sk);
 384         struct net *net = sock_net(sk);
 385
 386         /* ICMPs are not backlogged, hence we cannot get
 387          * an established socket here.
 388          */
 389         if (seq != tcp_rsk(req)->snt_isn) {
 390                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
 391         } else if (abort) {
 392                 /*
 393                  * Still in SYN_RECV, just remove it silently.
 394                  * There is no good way to pass the error to the newly
 395                  * created socket, and POSIX does not want network
 396                  * errors returned from accept().
 397                  */
 398                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
 399                 tcp_listendrop(req->rsk_listener);
 400         }
 401         reqsk_put(req);
 402 }
 403 EXPORT_SYMBOL(tcp_req_err);
 404
 405 /*
 406  * This routine is called by the ICMP module when it gets some
 407  * sort of error condition.  If err < 0 then the socket should
 408  * be closed and the error returned to the user.  If err > 0
 409  * it's just the icmp type << 8 | icmp code.  After adjustment
 410  * header points to the first 8 bytes of the tcp header.  We need
 411  * to find the appropriate port.
 412  *
 413  * The locking strategy used here is very "optimistic". When
 414  * someone else accesses the socket the ICMP is just dropped
 415  * and for some paths there is no check at all.
 416  * A more general error queue to queue errors for later handling
 417  * is probably better.
 418  *
 419  */
 420
 421 int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 422 {
 423         const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
 424         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
 425         struct inet_connection_sock *icsk;
 426         struct tcp_sock *tp;
 427         struct inet_sock *inet;
 428         const int type = icmp_hdr(icmp_skb)->type;
 429         const int code = icmp_hdr(icmp_skb)->code;
 430         struct sock *sk;
 431         struct sk_buff *skb;
 432         struct request_sock *fastopen;
 433         u32 seq, snd_una;
 434         s32 remaining;
 435         u32 delta_us;
 436         int err;
 437         struct net *net = dev_net(icmp_skb->dev);
 438
 439         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
 440                                        th->dest, iph->saddr, ntohs(th->source),
 441                                        inet_iif(icmp_skb), 0);
 442         if (!sk) {
 443                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
 444                 return -ENOENT;
 445         }
 446         if (sk->sk_state == TCP_TIME_WAIT) {
 447                 inet_twsk_put(inet_twsk(sk));
 448                 return 0;
 449         }
 450         seq = ntohl(th->seq);
 451         if (sk->sk_state == TCP_NEW_SYN_RECV) {
 452                 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
 453                                      type == ICMP_TIME_EXCEEDED ||
 454                                      (type == ICMP_DEST_UNREACH &&
 455                                       (code == ICMP_NET_UNREACH ||
 456                                        code == ICMP_HOST_UNREACH)));
 457                 return 0;
 458         }
 459
 460         bh_lock_sock(sk);
 461         /* If too many ICMPs get dropped on busy
 462          * servers this needs to be solved differently.
 463          * We do take care of PMTU discovery (RFC1191) special case :
 464          * we can receive locally generated ICMP messages while socket is held.
 465          */
 466         if (sock_owned_by_user(sk)) {
 467                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
 468                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
 469         }
 470         if (sk->sk_state == TCP_CLOSE)
 471                 goto out;
 472
 473         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
 474                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
 475                 goto out;
 476         }
 477
 478         icsk = inet_csk(sk);
 479         tp = tcp_sk(sk);
 480         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
 481         fastopen = tp->fastopen_rsk;
 482         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
 483         if (sk->sk_state != TCP_LISTEN &&
 484             !between(seq, snd_una, tp->snd_nxt)) {
 485                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
 486                 goto out;
 487         }
 488
 489         switch (type) {
 490         case ICMP_REDIRECT:
 491                 if (!sock_owned_by_user(sk))
 492                         do_redirect(icmp_skb, sk);
 493                 goto out;
 494         case ICMP_SOURCE_QUENCH:
 495                 /* Just silently ignore these. */
 496                 goto out;
 497         case ICMP_PARAMETERPROB:
 498                 err = EPROTO;
 499                 break;
 500         case ICMP_DEST_UNREACH:
 501                 if (code > NR_ICMP_UNREACH)
 502                         goto out;
 503
 504                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 505                         /* We are not interested in TCP_LISTEN and open_requests
 506                          * (SYN-ACKs send out by Linux are always <576bytes so
 507                          * they should go through unfragmented).
 508                          */
 509                         if (sk->sk_state == TCP_LISTEN)
 510                                 goto out;
 511
 512                         tp->mtu_info = info;
 513                         if (!sock_owned_by_user(sk)) {
 514                                 tcp_v4_mtu_reduced(sk);
 515                         } else {
 516                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
 517                                         sock_hold(sk);
 518                         }
 519                         goto out;
 520                 }
 521
 522                 err = icmp_err_convert[code].errno;
 523                 /* check if icmp_skb allows revert of backoff
 524                  * (see draft-zimmermann-tcp-lcd) */
 525                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
 526                         break;
 527                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
 528                     !icsk->icsk_backoff || fastopen)
 529                         break;
 530
 531                 if (sock_owned_by_user(sk))
 532                         break;
 533
 534                 skb = tcp_rtx_queue_head(sk);
 535                 if (WARN_ON_ONCE(!skb))
 536                         break;
 537
 538                 icsk->icsk_backoff--;
 539                 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
 540                                                TCP_TIMEOUT_INIT;
 541                 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
 542
 543
 544                 tcp_mstamp_refresh(tp);
 545                 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
 546                 remaining = icsk->icsk_rto -
 547                             usecs_to_jiffies(delta_us);
 548
 549                 if (remaining > 0) {
 550                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 551                                                   remaining, TCP_RTO_MAX);
 552                 } else {
 553                         /* RTO revert clocked out retransmission.
 554                          * Will retransmit now */
 555                         tcp_retransmit_timer(sk);
 556                 }
 557
 558                 break;
 559         case ICMP_TIME_EXCEEDED:
 560                 err = EHOSTUNREACH;
 561                 break;
 562         default:
 563                 goto out;
 564         }
 565
 566         switch (sk->sk_state) {
 567         case TCP_SYN_SENT:
 568         case TCP_SYN_RECV:
 569                 /* Only in fast or simultaneous open. If a fast open socket is
 570                  * is already accepted it is treated as a connected one below.
 571                  */
 572                 if (fastopen && !fastopen->sk)
 573                         break;
 574
 575                 if (!sock_owned_by_user(sk)) {
 576                         sk->sk_err = err;
 577
 578                         sk->sk_error_report(sk);
 579
 580                         tcp_done(sk);
 581                 } else {
 582                         sk->sk_err_soft = err;
 583                 }
 584                 goto out;
 585         }
 586
 587         /* If we've already connected we will keep trying
 588          * until we time out, or the user gives up.
 589          *
 590          * rfc1122 4.2.3.9 allows to consider as hard errors
 591          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 592          * but it is obsoleted by pmtu discovery).
 593          *
 594          * Note, that in modern internet, where routing is unreliable
 595          * and in each dark corner broken firewalls sit, sending random
 596          * errors ordered by their masters even this two messages finally lose
 597          * their original sense (even Linux sends invalid PORT_UNREACHs)
 598          *
 599          * Now we are in compliance with RFCs.
 600          *                                                      --ANK (980905)
 601          */
 602
 603         inet = inet_sk(sk);
 604         if (!sock_owned_by_user(sk) && inet->recverr) {
 605                 sk->sk_err = err;
 606                 sk->sk_error_report(sk);
 607         } else  { /* Only an error on timeout */
 608                 sk->sk_err_soft = err;
 609         }
 610
 611 out:
 612         bh_unlock_sock(sk);
 613         sock_put(sk);
 614         return 0;
 615 }
 616
 617 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
 618 {
 619         struct tcphdr *th = tcp_hdr(skb);
 620
 621         th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
 622         skb->csum_start = skb_transport_header(skb) - skb->head;
 623         skb->csum_offset = offsetof(struct tcphdr, check);
 624 }
 625
 626 /* This routine computes an IPv4 TCP checksum. */
 627 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 628 {
 629         const struct inet_sock *inet = inet_sk(sk);
 630
 631         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
 632 }
 633 EXPORT_SYMBOL(tcp_v4_send_check);
 634
 635 /*
 636  *      This routine will send an RST to the other tcp.
 637  *
 638  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 639  *                    for reset.
 640  *      Answer: if a packet caused RST, it is not for a socket
 641  *              existing in our system, if it is matched to a socket,
 642  *              it is just duplicate segment or bug in other side's TCP.
 643  *              So that we build reply only basing on parameters
 644  *              arrived with segment.
 645  *      Exception: precedence violation. We do not implement it in any case.
 646  */
 647
 648 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 649 {
 650         const struct tcphdr *th = tcp_hdr(skb);
 651         struct {
 652                 struct tcphdr th;
 653 #ifdef CONFIG_TCP_MD5SIG
 654                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
 655 #endif
 656         } rep;
 657         struct ip_reply_arg arg;
 658 #ifdef CONFIG_TCP_MD5SIG
 659         struct tcp_md5sig_key *key = NULL;
 660         const __u8 *hash_location = NULL;
 661         unsigned char newhash[16];
 662         int genhash;
 663         struct sock *sk1 = NULL;
 664 #endif
 665         u64 transmit_time = 0;
 666         struct sock *ctl_sk;
 667         struct net *net;
 668
 669         /* Never send a reset in response to a reset. */
 670         if (th->rst)
 671                 return;
 672
 673         /* If sk not NULL, it means we did a successful lookup and incoming
 674          * route had to be correct. prequeue might have dropped our dst.
 675          */
 676         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
 677                 return;
 678
 679         /* Swap the send and the receive. */
 680         memset(&rep, 0, sizeof(rep));
 681         rep.th.dest   = th->source;
 682         rep.th.source = th->dest;
 683         rep.th.doff   = sizeof(struct tcphdr) / 4;
 684         rep.th.rst    = 1;
 685
 686         if (th->ack) {
 687                 rep.th.seq = th->ack_seq;
 688         } else {
 689                 rep.th.ack = 1;
 690                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 691                                        skb->len - (th->doff << 2));
 692         }
 693
 694         memset(&arg, 0, sizeof(arg));
 695         arg.iov[0].iov_base = (unsigned char *)&rep;
 696         arg.iov[0].iov_len  = sizeof(rep.th);
 697
 698         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
 699 #ifdef CONFIG_TCP_MD5SIG
 700         rcu_read_lock();
 701         hash_location = tcp_parse_md5sig_option(th);
 702         if (sk && sk_fullsock(sk)) {
 703                 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
 704                                         &ip_hdr(skb)->saddr, AF_INET);
 705         } else if (hash_location) {
 706                 /*
 707                  * active side is lost. Try to find listening socket through
 708                  * source port, and then find md5 key through listening socket.
 709                  * we are not loose security here:
 710                  * Incoming packet is checked with md5 hash with finding key,
 711                  * no RST generated if md5 hash doesn't match.
 712                  */
 713                 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
 714                                              ip_hdr(skb)->saddr,
 715                                              th->source, ip_hdr(skb)->daddr,
 716                                              ntohs(th->source), inet_iif(skb),
 717                                              tcp_v4_sdif(skb));
 718                 /* don't send rst if it can't find key */
 719                 if (!sk1)
 720                         goto out;
 721
 722                 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
 723                                         &ip_hdr(skb)->saddr, AF_INET);
 724                 if (!key)
 725                         goto out;
 726
 727
 728                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
 729                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
 730                         goto out;
 731
 732         }
 733
 734         if (key) {
 735                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 736                                    (TCPOPT_NOP << 16) |
 737                                    (TCPOPT_MD5SIG << 8) |
 738                                    TCPOLEN_MD5SIG);
 739                 /* Update length and the length the header thinks exists */
 740                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 741                 rep.th.doff = arg.iov[0].iov_len / 4;
 742
 743                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 744                                      key, ip_hdr(skb)->saddr,
 745                                      ip_hdr(skb)->daddr, &rep.th);
 746         }
 747 #endif
 748         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 749                                       ip_hdr(skb)->saddr, /* XXX */
 750                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 751         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 752         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 753
 754         /* When socket is gone, all binding information is lost.
 755          * routing might fail in this case. No choice here, if we choose to force
 756          * input interface, we will misroute in case of asymmetric route.
 757          */
 758         if (sk) {
 759                 arg.bound_dev_if = sk->sk_bound_dev_if;
 760                 if (sk_fullsock(sk))
 761                         trace_tcp_send_reset(sk, skb);
 762         }
 763
 764         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
 765                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
 766
 767         arg.tos = ip_hdr(skb)->tos;
 768         arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
 769         local_bh_disable();
 770         ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
 771         if (sk) {
 772                 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
 773                                    inet_twsk(sk)->tw_mark : sk->sk_mark;
 774                 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
 775                                    inet_twsk(sk)->tw_priority : sk->sk_priority;
 776                 transmit_time = tcp_transmit_time(sk);
 777         }
 778         ip_send_unicast_reply(ctl_sk,
 779                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
 780                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 781                               &arg, arg.iov[0].iov_len,
 782                               transmit_time);
 783
 784         ctl_sk->sk_mark = 0;
 785         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 786         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
 787         local_bh_enable();
 788
 789 #ifdef CONFIG_TCP_MD5SIG
 790 out:
 791         rcu_read_unlock();
 792 #endif
 793 }
 794
 795 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
 796    outside socket context is ugly, certainly. What can I do?
 797  */
 798
 799 static void tcp_v4_send_ack(const struct sock *sk,
 800                             struct sk_buff *skb, u32 seq, u32 ack,
 801                             u32 win, u32 tsval, u32 tsecr, int oif,
 802                             struct tcp_md5sig_key *key,
 803                             int reply_flags, u8 tos)
 804 {
 805         const struct tcphdr *th = tcp_hdr(skb);
 806         struct {
 807                 struct tcphdr th;
 808                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 809 #ifdef CONFIG_TCP_MD5SIG
 810                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 811 #endif
 812                         ];
 813         } rep;
 814         struct net *net = sock_net(sk);
 815         struct ip_reply_arg arg;
 816         struct sock *ctl_sk;
 817         u64 transmit_time;
 818
 819         memset(&rep.th, 0, sizeof(struct tcphdr));
 820         memset(&arg, 0, sizeof(arg));
 821
 822         arg.iov[0].iov_base = (unsigned char *)&rep;
 823         arg.iov[0].iov_len  = sizeof(rep.th);
 824         if (tsecr) {
 825                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 826                                    (TCPOPT_TIMESTAMP << 8) |
 827                                    TCPOLEN_TIMESTAMP);
 828                 rep.opt[1] = htonl(tsval);
 829                 rep.opt[2] = htonl(tsecr);
 830                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 831         }
 832
 833         /* Swap the send and the receive. */
 834         rep.th.dest    = th->source;
 835         rep.th.source  = th->dest;
 836         rep.th.doff    = arg.iov[0].iov_len / 4;
 837         rep.th.seq     = htonl(seq);
 838         rep.th.ack_seq = htonl(ack);
 839         rep.th.ack     = 1;
 840         rep.th.window  = htons(win);
 841
 842 #ifdef CONFIG_TCP_MD5SIG
 843         if (key) {
 844                 int offset = (tsecr) ? 3 : 0;
 845
 846                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 847                                           (TCPOPT_NOP << 16) |
 848                                           (TCPOPT_MD5SIG << 8) |
 849                                           TCPOLEN_MD5SIG);
 850                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 851                 rep.th.doff = arg.iov[0].iov_len/4;
 852
 853                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 854                                     key, ip_hdr(skb)->saddr,
 855                                     ip_hdr(skb)->daddr, &rep.th);
 856         }
 857 #endif
 858         arg.flags = reply_flags;
 859         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 860                                       ip_hdr(skb)->saddr, /* XXX */
 861                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 862         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 863         if (oif)
 864                 arg.bound_dev_if = oif;
 865         arg.tos = tos;
 866         arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
 867         local_bh_disable();
 868         ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
 869         ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
 870                            inet_twsk(sk)->tw_mark : sk->sk_mark;
 871         ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
 872                            inet_twsk(sk)->tw_priority : sk->sk_priority;
 873         transmit_time = tcp_transmit_time(sk);
 874         ip_send_unicast_reply(ctl_sk,
 875                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
 876                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 877                               &arg, arg.iov[0].iov_len,
 878                               transmit_time);
 879
 880         ctl_sk->sk_mark = 0;
 881         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 882         local_bh_enable();
 883 }
 884
 885 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 886 {
 887         struct inet_timewait_sock *tw = inet_twsk(sk);
 888         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 889
 890         tcp_v4_send_ack(sk, skb,
 891                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 892                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 893                         tcp_time_stamp_raw() + tcptw->tw_ts_offset,
 894                         tcptw->tw_ts_recent,
 895                         tw->tw_bound_dev_if,
 896                         tcp_twsk_md5_key(tcptw),
 897                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
 898                         tw->tw_tos
 899                         );
 900
 901         inet_twsk_put(tw);
 902 }
 903
 904 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 905                                   struct request_sock *req)
 906 {
 907         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
 908          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
 909          */
 910         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
 911                                              tcp_sk(sk)->snd_nxt;
 912
 913         /* RFC 7323 2.3
 914          * The window field (SEG.WND) of every outgoing segment, with the
 915          * exception of <SYN> segments, MUST be right-shifted by
 916          * Rcv.Wind.Shift bits:
 917          */
 918         tcp_v4_send_ack(sk, skb, seq,
 919                         tcp_rsk(req)->rcv_nxt,
 920                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
 921                         tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
 922                         req->ts_recent,
 923                         0,
 924                         tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
 925                                           AF_INET),
 926                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
 927                         ip_hdr(skb)->tos);
 928 }
 929
 930 /*
 931  *      Send a SYN-ACK after having received a SYN.
 932  *      This still operates on a request_sock only, not on a big
 933  *      socket.
 934  */
 935 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
 936                               struct flowi *fl,
 937                               struct request_sock *req,
 938                               struct tcp_fastopen_cookie *foc,
 939                               enum tcp_synack_type synack_type)
 940 {
 941         const struct inet_request_sock *ireq = inet_rsk(req);
 942         struct flowi4 fl4;
 943         int err = -1;
 944         struct sk_buff *skb;
 945
 946         /* First, grab a route. */
 947         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
 948                 return -1;
 949
 950         skb = tcp_make_synack(sk, dst, req, foc, synack_type);
 951
 952         if (skb) {
 953                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
 954
 955                 rcu_read_lock();
 956                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
 957                                             ireq->ir_rmt_addr,
 958                                             rcu_dereference(ireq->ireq_opt));
 959                 rcu_read_unlock();
 960                 err = net_xmit_eval(err);
 961         }
 962
 963         return err;
 964 }
 965
 966 /*
 967  *      IPv4 request_sock destructor.
 968  */
 969 static void tcp_v4_reqsk_destructor(struct request_sock *req)
 970 {
 971         kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
 972 }
 973
 974 #ifdef CONFIG_TCP_MD5SIG
 975 /*
 976  * RFC2385 MD5 checksumming requires a mapping of
 977  * IP address->MD5 Key.
 978  * We need to maintain these in the sk structure.
 979  */
 980
 981 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
 982 EXPORT_SYMBOL(tcp_md5_needed);
 983
 984 /* Find the Key structure for an address.  */
 985 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk,
 986                                            const union tcp_md5_addr *addr,
 987                                            int family)
 988 {
 989         const struct tcp_sock *tp = tcp_sk(sk);
 990         struct tcp_md5sig_key *key;
 991         const struct tcp_md5sig_info *md5sig;
 992         __be32 mask;
 993         struct tcp_md5sig_key *best_match = NULL;
 994         bool match;
 995
 996         /* caller either holds rcu_read_lock() or socket lock */
 997         md5sig = rcu_dereference_check(tp->md5sig_info,
 998                                        lockdep_sock_is_held(sk));
 999         if (!md5sig)
1000                 return NULL;
1001
1002         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1003                 if (key->family != family)
1004                         continue;
1005
1006                 if (family == AF_INET) {
1007                         mask = inet_make_mask(key->prefixlen);
1008                         match = (key->addr.a4.s_addr & mask) ==
1009                                 (addr->a4.s_addr & mask);
1010 #if IS_ENABLED(CONFIG_IPV6)
1011                 } else if (family == AF_INET6) {
1012                         match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1013                                                   key->prefixlen);
1014 #endif
1015                 } else {
1016                         match = false;
1017                 }
1018
1019                 if (match && (!best_match ||
1020                               key->prefixlen > best_match->prefixlen))
1021                         best_match = key;
1022         }
1023         return best_match;
1024 }
1025 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1026
1027 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1028                                                       const union tcp_md5_addr *addr,
1029                                                       int family, u8 prefixlen)
1030 {
1031         const struct tcp_sock *tp = tcp_sk(sk);
1032         struct tcp_md5sig_key *key;
1033         unsigned int size = sizeof(struct in_addr);
1034         const struct tcp_md5sig_info *md5sig;
1035
1036         /* caller either holds rcu_read_lock() or socket lock */
1037         md5sig = rcu_dereference_check(tp->md5sig_info,
1038                                        lockdep_sock_is_held(sk));
1039         if (!md5sig)
1040                 return NULL;
1041 #if IS_ENABLED(CONFIG_IPV6)
1042         if (family == AF_INET6)
1043                 size = sizeof(struct in6_addr);
1044 #endif
1045         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1046                 if (key->family != family)
1047                         continue;
1048                 if (!memcmp(&key->addr, addr, size) &&
1049                     key->prefixlen == prefixlen)
1050                         return key;
1051         }
1052         return NULL;
1053 }
1054
1055 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1056                                          const struct sock *addr_sk)
1057 {
1058         const union tcp_md5_addr *addr;
1059
1060         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1061         return tcp_md5_do_lookup(sk, addr, AF_INET);
1062 }
1063 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1064
1065 /* This can be called on a newly created socket, from other files */
1066 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1067                    int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
1068                    gfp_t gfp)
1069 {
1070         /* Add Key to the list */
1071         struct tcp_md5sig_key *key;
1072         struct tcp_sock *tp = tcp_sk(sk);
1073         struct tcp_md5sig_info *md5sig;
1074
1075         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1076         if (key) {
1077                 /* Pre-existing entry - just update that one. */
1078                 memcpy(key->key, newkey, newkeylen);
1079                 key->keylen = newkeylen;
1080                 return 0;
1081         }
1082
1083         md5sig = rcu_dereference_protected(tp->md5sig_info,
1084                                            lockdep_sock_is_held(sk));
1085         if (!md5sig) {
1086                 md5sig = kmalloc(sizeof(*md5sig), gfp);
1087                 if (!md5sig)
1088                         return -ENOMEM;
1089
1090                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1091                 INIT_HLIST_HEAD(&md5sig->head);
1092                 rcu_assign_pointer(tp->md5sig_info, md5sig);
1093         }
1094
1095         key = sock_kmalloc(sk, sizeof(*key), gfp);
1096         if (!key)
1097                 return -ENOMEM;
1098         if (!tcp_alloc_md5sig_pool()) {
1099                 sock_kfree_s(sk, key, sizeof(*key));
1100                 return -ENOMEM;
1101         }
1102
1103         memcpy(key->key, newkey, newkeylen);
1104         key->keylen = newkeylen;
1105         key->family = family;
1106         key->prefixlen = prefixlen;
1107         memcpy(&key->addr, addr,
1108                (family == AF_INET6) ? sizeof(struct in6_addr) :
1109                                       sizeof(struct in_addr));
1110         hlist_add_head_rcu(&key->node, &md5sig->head);
1111         return 0;
1112 }
1113 EXPORT_SYMBOL(tcp_md5_do_add);
1114
1115 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1116                    u8 prefixlen)
1117 {
1118         struct tcp_md5sig_key *key;
1119
1120         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1121         if (!key)
1122                 return -ENOENT;
1123         hlist_del_rcu(&key->node);
1124         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1125         kfree_rcu(key, rcu);
1126         return 0;
1127 }
1128 EXPORT_SYMBOL(tcp_md5_do_del);
1129
1130 static void tcp_clear_md5_list(struct sock *sk)
1131 {
1132         struct tcp_sock *tp = tcp_sk(sk);
1133         struct tcp_md5sig_key *key;
1134         struct hlist_node *n;
1135         struct tcp_md5sig_info *md5sig;
1136
1137         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1138
1139         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1140                 hlist_del_rcu(&key->node);
1141                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1142                 kfree_rcu(key, rcu);
1143         }
1144 }
1145
1146 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1147                                  char __user *optval, int optlen)
1148 {
1149         struct tcp_md5sig cmd;
1150         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1151         u8 prefixlen = 32;
1152
1153         if (optlen < sizeof(cmd))
1154                 return -EINVAL;
1155
1156         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1157                 return -EFAULT;
1158
1159         if (sin->sin_family != AF_INET)
1160                 return -EINVAL;
1161
1162         if (optname == TCP_MD5SIG_EXT &&
1163             cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1164                 prefixlen = cmd.tcpm_prefixlen;
1165                 if (prefixlen > 32)
1166                         return -EINVAL;
1167         }
1168
1169         if (!cmd.tcpm_keylen)
1170                 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1171                                       AF_INET, prefixlen);
1172
1173         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1174                 return -EINVAL;
1175
1176         return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1177                               AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
1178                               GFP_KERNEL);
1179 }
1180
1181 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1182                                    __be32 daddr, __be32 saddr,
1183                                    const struct tcphdr *th, int nbytes)
1184 {
1185         struct tcp4_pseudohdr *bp;
1186         struct scatterlist sg;
1187         struct tcphdr *_th;
1188
1189         bp = hp->scratch;
1190         bp->saddr = saddr;
1191         bp->daddr = daddr;
1192         bp->pad = 0;
1193         bp->protocol = IPPROTO_TCP;
1194         bp->len = cpu_to_be16(nbytes);
1195
1196         _th = (struct tcphdr *)(bp + 1);
1197         memcpy(_th, th, sizeof(*th));
1198         _th->check = 0;
1199
1200         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1201         ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1202                                 sizeof(*bp) + sizeof(*th));
1203         return crypto_ahash_update(hp->md5_req);
1204 }
1205
1206 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1207                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1208 {
1209         struct tcp_md5sig_pool *hp;
1210         struct ahash_request *req;
1211
1212         hp = tcp_get_md5sig_pool();
1213         if (!hp)
1214                 goto clear_hash_noput;
1215         req = hp->md5_req;
1216
1217         if (crypto_ahash_init(req))
1218                 goto clear_hash;
1219         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1220                 goto clear_hash;
1221         if (tcp_md5_hash_key(hp, key))
1222                 goto clear_hash;
1223         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1224         if (crypto_ahash_final(req))
1225                 goto clear_hash;
1226
1227         tcp_put_md5sig_pool();
1228         return 0;
1229
1230 clear_hash:
1231         tcp_put_md5sig_pool();
1232 clear_hash_noput:
1233         memset(md5_hash, 0, 16);
1234         return 1;
1235 }
1236
1237 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1238                         const struct sock *sk,
1239                         const struct sk_buff *skb)
1240 {
1241         struct tcp_md5sig_pool *hp;
1242         struct ahash_request *req;
1243         const struct tcphdr *th = tcp_hdr(skb);
1244         __be32 saddr, daddr;
1245
1246         if (sk) { /* valid for establish/request sockets */
1247                 saddr = sk->sk_rcv_saddr;
1248                 daddr = sk->sk_daddr;
1249         } else {
1250                 const struct iphdr *iph = ip_hdr(skb);
1251                 saddr = iph->saddr;
1252                 daddr = iph->daddr;
1253         }
1254
1255         hp = tcp_get_md5sig_pool();
1256         if (!hp)
1257                 goto clear_hash_noput;
1258         req = hp->md5_req;
1259
1260         if (crypto_ahash_init(req))
1261                 goto clear_hash;
1262
1263         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1264                 goto clear_hash;
1265         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1266                 goto clear_hash;
1267         if (tcp_md5_hash_key(hp, key))
1268                 goto clear_hash;
1269         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1270         if (crypto_ahash_final(req))
1271                 goto clear_hash;
1272
1273         tcp_put_md5sig_pool();
1274         return 0;
1275
1276 clear_hash:
1277         tcp_put_md5sig_pool();
1278 clear_hash_noput:
1279         memset(md5_hash, 0, 16);
1280         return 1;
1281 }
1282 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1283
1284 #endif
1285
1286 /* Called with rcu_read_lock() */
1287 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1288                                     const struct sk_buff *skb)
1289 {
1290 #ifdef CONFIG_TCP_MD5SIG
1291         /*
1292          * This gets called for each TCP segment that arrives
1293          * so we want to be efficient.
1294          * We have 3 drop cases:
1295          * o No MD5 hash and one expected.
1296          * o MD5 hash and we're not expecting one.
1297          * o MD5 hash and its wrong.
1298          */
1299         const __u8 *hash_location = NULL;
1300         struct tcp_md5sig_key *hash_expected;
1301         const struct iphdr *iph = ip_hdr(skb);
1302         const struct tcphdr *th = tcp_hdr(skb);
1303         int genhash;
1304         unsigned char newhash[16];
1305
1306         hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1307                                           AF_INET);
1308         hash_location = tcp_parse_md5sig_option(th);
1309
1310         /* We've parsed the options - do we have a hash? */
1311         if (!hash_expected && !hash_location)
1312                 return false;
1313
1314         if (hash_expected && !hash_location) {
1315                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1316                 return true;
1317         }
1318
1319         if (!hash_expected && hash_location) {
1320                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1321                 return true;
1322         }
1323
1324         /* Okay, so this is hash_expected and hash_location -
1325          * so we need to calculate the checksum.
1326          */
1327         genhash = tcp_v4_md5_hash_skb(newhash,
1328                                       hash_expected,
1329                                       NULL, skb);
1330
1331         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1332                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1333                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1334                                      &iph->saddr, ntohs(th->source),
1335                                      &iph->daddr, ntohs(th->dest),
1336                                      genhash ? " tcp_v4_calc_md5_hash failed"
1337                                      : "");
1338                 return true;
1339         }
1340         return false;
1341 #endif
1342         return false;
1343 }
1344
1345 static void tcp_v4_init_req(struct request_sock *req,
1346                             const struct sock *sk_listener,
1347                             struct sk_buff *skb)
1348 {
1349         struct inet_request_sock *ireq = inet_rsk(req);
1350         struct net *net = sock_net(sk_listener);
1351
1352         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1353         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1354         RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1355 }
1356
1357 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1358                                           struct flowi *fl,
1359                                           const struct request_sock *req)
1360 {
1361         return inet_csk_route_req(sk, &fl->u.ip4, req);
1362 }
1363
1364 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1365         .family         =       PF_INET,
1366         .obj_size       =       sizeof(struct tcp_request_sock),
1367         .rtx_syn_ack    =       tcp_rtx_synack,
1368         .send_ack       =       tcp_v4_reqsk_send_ack,
1369         .destructor     =       tcp_v4_reqsk_destructor,
1370         .send_reset     =       tcp_v4_send_reset,
1371         .syn_ack_timeout =      tcp_syn_ack_timeout,
1372 };
1373
1374 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1375         .mss_clamp      =       TCP_MSS_DEFAULT,
1376 #ifdef CONFIG_TCP_MD5SIG
1377         .req_md5_lookup =       tcp_v4_md5_lookup,
1378         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1379 #endif
1380         .init_req       =       tcp_v4_init_req,
1381 #ifdef CONFIG_SYN_COOKIES
1382         .cookie_init_seq =      cookie_v4_init_sequence,
1383 #endif
1384         .route_req      =       tcp_v4_route_req,
1385         .init_seq       =       tcp_v4_init_seq,
1386         .init_ts_off    =       tcp_v4_init_ts_off,
1387         .send_synack    =       tcp_v4_send_synack,
1388 };
1389
1390 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1391 {
1392         /* Never answer to SYNs send to broadcast or multicast */
1393         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1394                 goto drop;
1395
1396         return tcp_conn_request(&tcp_request_sock_ops,
1397                                 &tcp_request_sock_ipv4_ops, sk, skb);
1398
1399 drop:
1400         tcp_listendrop(sk);
1401         return 0;
1402 }
1403 EXPORT_SYMBOL(tcp_v4_conn_request);
1404
1405
1406 /*
1407  * The three way handshake has completed - we got a valid synack -
1408  * now create the new socket.
1409  */
1410 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1411                                   struct request_sock *req,
1412                                   struct dst_entry *dst,
1413                                   struct request_sock *req_unhash,
1414                                   bool *own_req)
1415 {
1416         struct inet_request_sock *ireq;
1417         struct inet_sock *newinet;
1418         struct tcp_sock *newtp;
1419         struct sock *newsk;
1420 #ifdef CONFIG_TCP_MD5SIG
1421         struct tcp_md5sig_key *key;
1422 #endif
1423         struct ip_options_rcu *inet_opt;
1424
1425         if (sk_acceptq_is_full(sk))
1426                 goto exit_overflow;
1427
1428         newsk = tcp_create_openreq_child(sk, req, skb);
1429         if (!newsk)
1430                 goto exit_nonewsk;
1431
1432         newsk->sk_gso_type = SKB_GSO_TCPV4;
1433         inet_sk_rx_dst_set(newsk, skb);
1434
1435         newtp                 = tcp_sk(newsk);
1436         newinet               = inet_sk(newsk);
1437         ireq                  = inet_rsk(req);
1438         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1439         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1440         newsk->sk_bound_dev_if = ireq->ir_iif;
1441         newinet->inet_saddr   = ireq->ir_loc_addr;
1442         inet_opt              = rcu_dereference(ireq->ireq_opt);
1443         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1444         newinet->mc_index     = inet_iif(skb);
1445         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1446         newinet->rcv_tos      = ip_hdr(skb)->tos;
1447         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1448         if (inet_opt)
1449                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1450         newinet->inet_id = newtp->write_seq ^ jiffies;
1451
1452         if (!dst) {
1453                 dst = inet_csk_route_child_sock(sk, newsk, req);
1454                 if (!dst)
1455                         goto put_and_exit;
1456         } else {
1457                 /* syncookie case : see end of cookie_v4_check() */
1458         }
1459         sk_setup_caps(newsk, dst);
1460
1461         tcp_ca_openreq_child(newsk, dst);
1462
1463         tcp_sync_mss(newsk, dst_mtu(dst));
1464         newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1465
1466         tcp_initialize_rcv_mss(newsk);
1467
1468 #ifdef CONFIG_TCP_MD5SIG
1469         /* Copy over the MD5 key from the original socket */
1470         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1471                                 AF_INET);
1472         if (key) {
1473                 /*
1474                  * We're using one, so create a matching key
1475                  * on the newsk structure. If we fail to get
1476                  * memory, then we end up not copying the key
1477                  * across. Shucks.
1478                  */
1479                 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1480                                AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1481                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1482         }
1483 #endif
1484
1485         if (__inet_inherit_port(sk, newsk) < 0)
1486                 goto put_and_exit;
1487         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1488         if (likely(*own_req)) {
1489                 tcp_move_syn(newtp, req);
1490                 ireq->ireq_opt = NULL;
1491         } else {
1492                 newinet->inet_opt = NULL;
1493         }
1494         return newsk;
1495
1496 exit_overflow:
1497         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1498 exit_nonewsk:
1499         dst_release(dst);
1500 exit:
1501         tcp_listendrop(sk);
1502         return NULL;
1503 put_and_exit:
1504         newinet->inet_opt = NULL;
1505         inet_csk_prepare_forced_close(newsk);
1506         tcp_done(newsk);
1507         goto exit;
1508 }
1509 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1510
1511 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1512 {
1513 #ifdef CONFIG_SYN_COOKIES
1514         const struct tcphdr *th = tcp_hdr(skb);
1515
1516         if (!th->syn)
1517                 sk = cookie_v4_check(sk, skb);
1518 #endif
1519         return sk;
1520 }
1521
1522 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1523                          struct tcphdr *th, u32 *cookie)
1524 {
1525         u16 mss = 0;
1526 #ifdef CONFIG_SYN_COOKIES
1527         mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1528                                     &tcp_request_sock_ipv4_ops, sk, th);
1529         if (mss) {
1530                 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1531                 tcp_synq_overflow(sk);
1532         }
1533 #endif
1534         return mss;
1535 }
1536
1537 /* The socket must have it's spinlock held when we get
1538  * here, unless it is a TCP_LISTEN socket.
1539  *
1540  * We have a potential double-lock case here, so even when
1541  * doing backlog processing we use the BH locking scheme.
1542  * This is because we cannot sleep with the original spinlock
1543  * held.
1544  */
1545 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1546 {
1547         struct sock *rsk;
1548
1549         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1550                 struct dst_entry *dst = sk->sk_rx_dst;
1551
1552                 sock_rps_save_rxhash(sk, skb);
1553                 sk_mark_napi_id(sk, skb);
1554                 if (dst) {
1555                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1556                             !dst->ops->check(dst, 0)) {
1557                                 dst_release(dst);
1558                                 sk->sk_rx_dst = NULL;
1559                         }
1560                 }
1561                 tcp_rcv_established(sk, skb);
1562                 return 0;
1563         }
1564
1565         if (tcp_checksum_complete(skb))
1566                 goto csum_err;
1567
1568         if (sk->sk_state == TCP_LISTEN) {
1569                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1570
1571                 if (!nsk)
1572                         goto discard;
1573                 if (nsk != sk) {
1574                         if (tcp_child_process(sk, nsk, skb)) {
1575                                 rsk = nsk;
1576                                 goto reset;
1577                         }
1578                         return 0;
1579                 }
1580         } else
1581                 sock_rps_save_rxhash(sk, skb);
1582
1583         if (tcp_rcv_state_process(sk, skb)) {
1584                 rsk = sk;
1585                 goto reset;
1586         }
1587         return 0;
1588
1589 reset:
1590         tcp_v4_send_reset(rsk, skb);
1591 discard:
1592         kfree_skb(skb);
1593         /* Be careful here. If this function gets more complicated and
1594          * gcc suffers from register pressure on the x86, sk (in %ebx)
1595          * might be destroyed here. This current version compiles correctly,
1596          * but you have been warned.
1597          */
1598         return 0;
1599
1600 csum_err:
1601         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1602         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1603         goto discard;
1604 }
1605 EXPORT_SYMBOL(tcp_v4_do_rcv);
1606
1607 int tcp_v4_early_demux(struct sk_buff *skb)
1608 {
1609         const struct iphdr *iph;
1610         const struct tcphdr *th;
1611         struct sock *sk;
1612
1613         if (skb->pkt_type != PACKET_HOST)
1614                 return 0;
1615
1616         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1617                 return 0;
1618
1619         iph = ip_hdr(skb);
1620         th = tcp_hdr(skb);
1621
1622         if (th->doff < sizeof(struct tcphdr) / 4)
1623                 return 0;
1624
1625         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1626                                        iph->saddr, th->source,
1627                                        iph->daddr, ntohs(th->dest),
1628                                        skb->skb_iif, inet_sdif(skb));
1629         if (sk) {
1630                 skb->sk = sk;
1631                 skb->destructor = sock_edemux;
1632                 if (sk_fullsock(sk)) {
1633                         struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1634
1635                         if (dst)
1636                                 dst = dst_check(dst, 0);
1637                         if (dst &&
1638                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1639                                 skb_dst_set_noref(skb, dst);
1640                 }
1641         }
1642         return 0;
1643 }
1644
1645 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1646 {
1647         u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1648         struct skb_shared_info *shinfo;
1649         const struct tcphdr *th;
1650         struct tcphdr *thtail;
1651         struct sk_buff *tail;
1652         unsigned int hdrlen;
1653         bool fragstolen;
1654         u32 gso_segs;
1655         int delta;
1656
1657         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1658          * we can fix skb->truesize to its real value to avoid future drops.
1659          * This is valid because skb is not yet charged to the socket.
1660          * It has been noticed pure SACK packets were sometimes dropped
1661          * (if cooked by drivers without copybreak feature).
1662          */
1663         skb_condense(skb);
1664
1665         skb_dst_drop(skb);
1666
1667         if (unlikely(tcp_checksum_complete(skb))) {
1668                 bh_unlock_sock(sk);
1669                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1670                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1671                 return true;
1672         }
1673
1674         /* Attempt coalescing to last skb in backlog, even if we are
1675          * above the limits.
1676          * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1677          */
1678         th = (const struct tcphdr *)skb->data;
1679         hdrlen = th->doff * 4;
1680         shinfo = skb_shinfo(skb);
1681
1682         if (!shinfo->gso_size)
1683                 shinfo->gso_size = skb->len - hdrlen;
1684
1685         if (!shinfo->gso_segs)
1686                 shinfo->gso_segs = 1;
1687
1688         tail = sk->sk_backlog.tail;
1689         if (!tail)
1690                 goto no_coalesce;
1691         thtail = (struct tcphdr *)tail->data;
1692
1693         if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1694             TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1695             ((TCP_SKB_CB(tail)->tcp_flags |
1696               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1697             !((TCP_SKB_CB(tail)->tcp_flags &
1698               TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1699             ((TCP_SKB_CB(tail)->tcp_flags ^
1700               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1701 #ifdef CONFIG_TLS_DEVICE
1702             tail->decrypted != skb->decrypted ||
1703 #endif
1704             thtail->doff != th->doff ||
1705             memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1706                 goto no_coalesce;
1707
1708         __skb_pull(skb, hdrlen);
1709         if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1710                 thtail->window = th->window;
1711
1712                 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1713
1714                 if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))
1715                         TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1716
1717                 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1718                  * thtail->fin, so that the fast path in tcp_rcv_established()
1719                  * is not entered if we append a packet with a FIN.
1720                  * SYN, RST, URG are not present.
1721                  * ACK is set on both packets.
1722                  * PSH : we do not really care in TCP stack,
1723                  *       at least for 'GRO' packets.
1724                  */
1725                 thtail->fin |= th->fin;
1726                 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1727
1728                 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1729                         TCP_SKB_CB(tail)->has_rxtstamp = true;
1730                         tail->tstamp = skb->tstamp;
1731                         skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1732                 }
1733
1734                 /* Not as strict as GRO. We only need to carry mss max value */
1735                 skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
1736                                                  skb_shinfo(tail)->gso_size);
1737
1738                 gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
1739                 skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
1740
1741                 sk->sk_backlog.len += delta;
1742                 __NET_INC_STATS(sock_net(sk),
1743                                 LINUX_MIB_TCPBACKLOGCOALESCE);
1744                 kfree_skb_partial(skb, fragstolen);
1745                 return false;
1746         }
1747         __skb_push(skb, hdrlen);
1748
1749 no_coalesce:
1750         /* Only socket owner can try to collapse/prune rx queues
1751          * to reduce memory overhead, so add a little headroom here.
1752          * Few sockets backlog are possibly concurrently non empty.
1753          */
1754         limit += 64*1024;
1755
1756         if (unlikely(sk_add_backlog(sk, skb, limit))) {
1757                 bh_unlock_sock(sk);
1758                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1759                 return true;
1760         }
1761         return false;
1762 }
1763 EXPORT_SYMBOL(tcp_add_backlog);
1764
1765 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1766 {
1767         struct tcphdr *th = (struct tcphdr *)skb->data;
1768
1769         return sk_filter_trim_cap(sk, skb, th->doff * 4);
1770 }
1771 EXPORT_SYMBOL(tcp_filter);
1772
1773 static void tcp_v4_restore_cb(struct sk_buff *skb)
1774 {
1775         memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1776                 sizeof(struct inet_skb_parm));
1777 }
1778
1779 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1780                            const struct tcphdr *th)
1781 {
1782         /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1783          * barrier() makes sure compiler wont play fool^Waliasing games.
1784          */
1785         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1786                 sizeof(struct inet_skb_parm));
1787         barrier();
1788
1789         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1790         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1791                                     skb->len - th->doff * 4);
1792         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1793         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1794         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1795         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1796         TCP_SKB_CB(skb)->sacked  = 0;
1797         TCP_SKB_CB(skb)->has_rxtstamp =
1798                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1799 }
1800
1801 /*
1802  *      From tcp_input.c
1803  */
1804
1805 int tcp_v4_rcv(struct sk_buff *skb)
1806 {
1807         struct net *net = dev_net(skb->dev);
1808         struct sk_buff *skb_to_free;
1809         int sdif = inet_sdif(skb);
1810         const struct iphdr *iph;
1811         const struct tcphdr *th;
1812         bool refcounted;
1813         struct sock *sk;
1814         int ret;
1815
1816         if (skb->pkt_type != PACKET_HOST)
1817                 goto discard_it;
1818
1819         /* Count it even if it's bad */
1820         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1821
1822         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1823                 goto discard_it;
1824
1825         th = (const struct tcphdr *)skb->data;
1826
1827         if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1828                 goto bad_packet;
1829         if (!pskb_may_pull(skb, th->doff * 4))
1830                 goto discard_it;
1831
1832         /* An explanation is required here, I think.
1833          * Packet length and doff are validated by header prediction,
1834          * provided case of th->doff==0 is eliminated.
1835          * So, we defer the checks. */
1836
1837         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1838                 goto csum_error;
1839
1840         th = (const struct tcphdr *)skb->data;
1841         iph = ip_hdr(skb);
1842 lookup:
1843         sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1844                                th->dest, sdif, &refcounted);
1845         if (!sk)
1846                 goto no_tcp_socket;
1847
1848 process:
1849         if (sk->sk_state == TCP_TIME_WAIT)
1850                 goto do_time_wait;
1851
1852         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1853                 struct request_sock *req = inet_reqsk(sk);
1854                 bool req_stolen = false;
1855                 struct sock *nsk;
1856
1857                 sk = req->rsk_listener;
1858                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1859                         sk_drops_add(sk, skb);
1860                         reqsk_put(req);
1861                         goto discard_it;
1862                 }
1863                 if (tcp_checksum_complete(skb)) {
1864                         reqsk_put(req);
1865                         goto csum_error;
1866                 }
1867                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1868                         inet_csk_reqsk_queue_drop_and_put(sk, req);
1869                         goto lookup;
1870                 }
1871                 /* We own a reference on the listener, increase it again
1872                  * as we might lose it too soon.
1873                  */
1874                 sock_hold(sk);
1875                 refcounted = true;
1876                 nsk = NULL;
1877                 if (!tcp_filter(sk, skb)) {
1878                         th = (const struct tcphdr *)skb->data;
1879                         iph = ip_hdr(skb);
1880                         tcp_v4_fill_cb(skb, iph, th);
1881                         nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1882                 }
1883                 if (!nsk) {
1884                         reqsk_put(req);
1885                         if (req_stolen) {
1886                                 /* Another cpu got exclusive access to req
1887                                  * and created a full blown socket.
1888                                  * Try to feed this packet to this socket
1889                                  * instead of discarding it.
1890                                  */
1891                                 tcp_v4_restore_cb(skb);
1892                                 sock_put(sk);
1893                                 goto lookup;
1894                         }
1895                         goto discard_and_relse;
1896                 }
1897                 if (nsk == sk) {
1898                         reqsk_put(req);
1899                         tcp_v4_restore_cb(skb);
1900                 } else if (tcp_child_process(sk, nsk, skb)) {
1901                         tcp_v4_send_reset(nsk, skb);
1902                         goto discard_and_relse;
1903                 } else {
1904                         sock_put(sk);
1905                         return 0;
1906                 }
1907         }
1908         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1909                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1910                 goto discard_and_relse;
1911         }
1912
1913         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1914                 goto discard_and_relse;
1915
1916         if (tcp_v4_inbound_md5_hash(sk, skb))
1917                 goto discard_and_relse;
1918
1919         nf_reset(skb);
1920
1921         if (tcp_filter(sk, skb))
1922                 goto discard_and_relse;
1923         th = (const struct tcphdr *)skb->data;
1924         iph = ip_hdr(skb);
1925         tcp_v4_fill_cb(skb, iph, th);
1926
1927         skb->dev = NULL;
1928
1929         if (sk->sk_state == TCP_LISTEN) {
1930                 ret = tcp_v4_do_rcv(sk, skb);
1931                 goto put_and_return;
1932         }
1933
1934         sk_incoming_cpu_update(sk);
1935
1936         bh_lock_sock_nested(sk);
1937         tcp_segs_in(tcp_sk(sk), skb);
1938         ret = 0;
1939         if (!sock_owned_by_user(sk)) {
1940                 skb_to_free = sk->sk_rx_skb_cache;
1941                 sk->sk_rx_skb_cache = NULL;
1942                 ret = tcp_v4_do_rcv(sk, skb);
1943         } else {
1944                 if (tcp_add_backlog(sk, skb))
1945                         goto discard_and_relse;
1946                 skb_to_free = NULL;
1947         }
1948         bh_unlock_sock(sk);
1949         if (skb_to_free)
1950                 __kfree_skb(skb_to_free);
1951
1952 put_and_return:
1953         if (refcounted)
1954                 sock_put(sk);
1955
1956         return ret;
1957
1958 no_tcp_socket:
1959         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1960                 goto discard_it;
1961
1962         tcp_v4_fill_cb(skb, iph, th);
1963
1964         if (tcp_checksum_complete(skb)) {
1965 csum_error:
1966                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1967 bad_packet:
1968                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
1969         } else {
1970                 tcp_v4_send_reset(NULL, skb);
1971         }
1972
1973 discard_it:
1974         /* Discard frame. */
1975         kfree_skb(skb);
1976         return 0;
1977
1978 discard_and_relse:
1979         sk_drops_add(sk, skb);
1980         if (refcounted)
1981                 sock_put(sk);
1982         goto discard_it;
1983
1984 do_time_wait:
1985         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1986                 inet_twsk_put(inet_twsk(sk));
1987                 goto discard_it;
1988         }
1989
1990         tcp_v4_fill_cb(skb, iph, th);
1991
1992         if (tcp_checksum_complete(skb)) {
1993                 inet_twsk_put(inet_twsk(sk));
1994                 goto csum_error;
1995         }
1996         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1997         case TCP_TW_SYN: {
1998                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1999                                                         &tcp_hashinfo, skb,
2000                                                         __tcp_hdrlen(th),
2001                                                         iph->saddr, th->source,
2002                                                         iph->daddr, th->dest,
2003                                                         inet_iif(skb),
2004                                                         sdif);
2005                 if (sk2) {
2006                         inet_twsk_deschedule_put(inet_twsk(sk));
2007                         sk = sk2;
2008                         tcp_v4_restore_cb(skb);
2009                         refcounted = false;
2010                         goto process;
2011                 }
2012         }
2013                 /* to ACK */
2014                 /* fall through */
2015         case TCP_TW_ACK:
2016                 tcp_v4_timewait_ack(sk, skb);
2017                 break;
2018         case TCP_TW_RST:
2019                 tcp_v4_send_reset(sk, skb);
2020                 inet_twsk_deschedule_put(inet_twsk(sk));
2021                 goto discard_it;
2022         case TCP_TW_SUCCESS:;
2023         }
2024         goto discard_it;
2025 }
2026
2027 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2028         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2029         .twsk_unique    = tcp_twsk_unique,
2030         .twsk_destructor= tcp_twsk_destructor,
2031 };
2032
2033 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2034 {
2035         struct dst_entry *dst = skb_dst(skb);
2036
2037         if (dst && dst_hold_safe(dst)) {
2038                 sk->sk_rx_dst = dst;
2039                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2040         }
2041 }
2042 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2043
2044 const struct inet_connection_sock_af_ops ipv4_specific = {
2045         .queue_xmit        = ip_queue_xmit,
2046         .send_check        = tcp_v4_send_check,
2047         .rebuild_header    = inet_sk_rebuild_header,
2048         .sk_rx_dst_set     = inet_sk_rx_dst_set,
2049         .conn_request      = tcp_v4_conn_request,
2050         .syn_recv_sock     = tcp_v4_syn_recv_sock,
2051         .net_header_len    = sizeof(struct iphdr),
2052         .setsockopt        = ip_setsockopt,
2053         .getsockopt        = ip_getsockopt,
2054         .addr2sockaddr     = inet_csk_addr2sockaddr,
2055         .sockaddr_len      = sizeof(struct sockaddr_in),
2056 #ifdef CONFIG_COMPAT
2057         .compat_setsockopt = compat_ip_setsockopt,
2058         .compat_getsockopt = compat_ip_getsockopt,
2059 #endif
2060         .mtu_reduced       = tcp_v4_mtu_reduced,
2061 };
2062 EXPORT_SYMBOL(ipv4_specific);
2063
2064 #ifdef CONFIG_TCP_MD5SIG
2065 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2066         .md5_lookup             = tcp_v4_md5_lookup,
2067         .calc_md5_hash          = tcp_v4_md5_hash_skb,
2068         .md5_parse              = tcp_v4_parse_md5_keys,
2069 };
2070 #endif
2071
2072 /* NOTE: A lot of things set to zero explicitly by call to
2073  *       sk_alloc() so need not be done here.
2074  */
2075 static int tcp_v4_init_sock(struct sock *sk)
2076 {
2077         struct inet_connection_sock *icsk = inet_csk(sk);
2078
2079         tcp_init_sock(sk);
2080
2081         icsk->icsk_af_ops = &ipv4_specific;
2082
2083 #ifdef CONFIG_TCP_MD5SIG
2084         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2085 #endif
2086
2087         return 0;
2088 }
2089
2090 void tcp_v4_destroy_sock(struct sock *sk)
2091 {
2092         struct tcp_sock *tp = tcp_sk(sk);
2093
2094         trace_tcp_destroy_sock(sk);
2095
2096         tcp_clear_xmit_timers(sk);
2097
2098         tcp_cleanup_congestion_control(sk);
2099
2100         tcp_cleanup_ulp(sk);
2101
2102         /* Cleanup up the write buffer. */
2103         tcp_write_queue_purge(sk);
2104
2105         /* Check if we want to disable active TFO */
2106         tcp_fastopen_active_disable_ofo_check(sk);
2107
2108         /* Cleans up our, hopefully empty, out_of_order_queue. */
2109         skb_rbtree_purge(&tp->out_of_order_queue);
2110
2111 #ifdef CONFIG_TCP_MD5SIG
2112         /* Clean up the MD5 key list, if any */
2113         if (tp->md5sig_info) {
2114                 tcp_clear_md5_list(sk);
2115                 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2116                 tp->md5sig_info = NULL;
2117         }
2118 #endif
2119
2120         /* Clean up a referenced TCP bind bucket. */
2121         if (inet_csk(sk)->icsk_bind_hash)
2122                 inet_put_port(sk);
2123
2124         BUG_ON(tp->fastopen_rsk);
2125
2126         /* If socket is aborted during connect operation */
2127         tcp_free_fastopen_req(tp);
2128         tcp_fastopen_destroy_cipher(sk);
2129         tcp_saved_syn_free(tp);
2130
2131         sk_sockets_allocated_dec(sk);
2132 }
2133 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2134
2135 #ifdef CONFIG_PROC_FS
2136 /* Proc filesystem TCP sock list dumping. */
2137
2138 /*
2139  * Get next listener socket follow cur.  If cur is NULL, get first socket
2140  * starting from bucket given in st->bucket; when st->bucket is zero the
2141  * very first socket in the hash table is returned.
2142  */
2143 static void *listening_get_next(struct seq_file *seq, void *cur)
2144 {
2145         struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2146         struct tcp_iter_state *st = seq->private;
2147         struct net *net = seq_file_net(seq);
2148         struct inet_listen_hashbucket *ilb;
2149         struct sock *sk = cur;
2150
2151         if (!sk) {
2152 get_head:
2153                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2154                 spin_lock(&ilb->lock);
2155                 sk = sk_head(&ilb->head);
2156                 st->offset = 0;
2157                 goto get_sk;
2158         }
2159         ilb = &tcp_hashinfo.listening_hash[st->bucket];
2160         ++st->num;
2161         ++st->offset;
2162
2163         sk = sk_next(sk);
2164 get_sk:
2165         sk_for_each_from(sk) {
2166                 if (!net_eq(sock_net(sk), net))
2167                         continue;
2168                 if (sk->sk_family == afinfo->family)
2169                         return sk;
2170         }
2171         spin_unlock(&ilb->lock);
2172         st->offset = 0;
2173         if (++st->bucket < INET_LHTABLE_SIZE)
2174                 goto get_head;
2175         return NULL;
2176 }
2177
2178 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2179 {
2180         struct tcp_iter_state *st = seq->private;
2181         void *rc;
2182
2183         st->bucket = 0;
2184         st->offset = 0;
2185         rc = listening_get_next(seq, NULL);
2186
2187         while (rc && *pos) {
2188                 rc = listening_get_next(seq, rc);
2189                 --*pos;
2190         }
2191         return rc;
2192 }
2193
2194 static inline bool empty_bucket(const struct tcp_iter_state *st)
2195 {
2196         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2197 }
2198
2199 /*
2200  * Get first established socket starting from bucket given in st->bucket.
2201  * If st->bucket is zero, the very first socket in the hash is returned.
2202  */
2203 static void *established_get_first(struct seq_file *seq)
2204 {
2205         struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2206         struct tcp_iter_state *st = seq->private;
2207         struct net *net = seq_file_net(seq);
2208         void *rc = NULL;
2209
2210         st->offset = 0;
2211         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2212                 struct sock *sk;
2213                 struct hlist_nulls_node *node;
2214                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2215
2216                 /* Lockless fast path for the common case of empty buckets */
2217                 if (empty_bucket(st))
2218                         continue;
2219
2220                 spin_lock_bh(lock);
2221                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2222                         if (sk->sk_family != afinfo->family ||
2223                             !net_eq(sock_net(sk), net)) {
2224                                 continue;
2225                         }
2226                         rc = sk;
2227                         goto out;
2228                 }
2229                 spin_unlock_bh(lock);
2230         }
2231 out:
2232         return rc;
2233 }
2234
2235 static void *established_get_next(struct seq_file *seq, void *cur)
2236 {
2237         struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2238         struct sock *sk = cur;
2239         struct hlist_nulls_node *node;
2240         struct tcp_iter_state *st = seq->private;
2241         struct net *net = seq_file_net(seq);
2242
2243         ++st->num;
2244         ++st->offset;
2245
2246         sk = sk_nulls_next(sk);
2247
2248         sk_nulls_for_each_from(sk, node) {
2249                 if (sk->sk_family == afinfo->family &&
2250                     net_eq(sock_net(sk), net))
2251                         return sk;
2252         }
2253
2254         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2255         ++st->bucket;
2256         return established_get_first(seq);
2257 }
2258
2259 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2260 {
2261         struct tcp_iter_state *st = seq->private;
2262         void *rc;
2263
2264         st->bucket = 0;
2265         rc = established_get_first(seq);
2266
2267         while (rc && pos) {
2268                 rc = established_get_next(seq, rc);
2269                 --pos;
2270         }
2271         return rc;
2272 }
2273
2274 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2275 {
2276         void *rc;
2277         struct tcp_iter_state *st = seq->private;
2278
2279         st->state = TCP_SEQ_STATE_LISTENING;
2280         rc        = listening_get_idx(seq, &pos);
2281
2282         if (!rc) {
2283                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2284                 rc        = established_get_idx(seq, pos);
2285         }
2286
2287         return rc;
2288 }
2289
2290 static void *tcp_seek_last_pos(struct seq_file *seq)
2291 {
2292         struct tcp_iter_state *st = seq->private;
2293         int offset = st->offset;
2294         int orig_num = st->num;
2295         void *rc = NULL;
2296
2297         switch (st->state) {
2298         case TCP_SEQ_STATE_LISTENING:
2299                 if (st->bucket >= INET_LHTABLE_SIZE)
2300                         break;
2301                 st->state = TCP_SEQ_STATE_LISTENING;
2302                 rc = listening_get_next(seq, NULL);
2303                 while (offset-- && rc)
2304                         rc = listening_get_next(seq, rc);
2305                 if (rc)
2306                         break;
2307                 st->bucket = 0;
2308                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2309                 /* Fallthrough */
2310         case TCP_SEQ_STATE_ESTABLISHED:
2311                 if (st->bucket > tcp_hashinfo.ehash_mask)
2312                         break;
2313                 rc = established_get_first(seq);
2314                 while (offset-- && rc)
2315                         rc = established_get_next(seq, rc);
2316         }
2317
2318         st->num = orig_num;
2319
2320         return rc;
2321 }
2322
2323 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2324 {
2325         struct tcp_iter_state *st = seq->private;
2326         void *rc;
2327
2328         if (*pos && *pos == st->last_pos) {
2329                 rc = tcp_seek_last_pos(seq);
2330                 if (rc)
2331                         goto out;
2332         }
2333
2334         st->state = TCP_SEQ_STATE_LISTENING;
2335         st->num = 0;
2336         st->bucket = 0;
2337         st->offset = 0;
2338         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2339
2340 out:
2341         st->last_pos = *pos;
2342         return rc;
2343 }
2344 EXPORT_SYMBOL(tcp_seq_start);
2345
2346 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2347 {
2348         struct tcp_iter_state *st = seq->private;
2349         void *rc = NULL;
2350
2351         if (v == SEQ_START_TOKEN) {
2352                 rc = tcp_get_idx(seq, 0);
2353                 goto out;
2354         }
2355
2356         switch (st->state) {
2357         case TCP_SEQ_STATE_LISTENING:
2358                 rc = listening_get_next(seq, v);
2359                 if (!rc) {
2360                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2361                         st->bucket = 0;
2362                         st->offset = 0;
2363                         rc        = established_get_first(seq);
2364                 }
2365                 break;
2366         case TCP_SEQ_STATE_ESTABLISHED:
2367                 rc = established_get_next(seq, v);
2368                 break;
2369         }
2370 out:
2371         ++*pos;
2372         st->last_pos = *pos;
2373         return rc;
2374 }
2375 EXPORT_SYMBOL(tcp_seq_next);
2376
2377 void tcp_seq_stop(struct seq_file *seq, void *v)
2378 {
2379         struct tcp_iter_state *st = seq->private;
2380
2381         switch (st->state) {
2382         case TCP_SEQ_STATE_LISTENING:
2383                 if (v != SEQ_START_TOKEN)
2384                         spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2385                 break;
2386         case TCP_SEQ_STATE_ESTABLISHED:
2387                 if (v)
2388                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2389                 break;
2390         }
2391 }
2392 EXPORT_SYMBOL(tcp_seq_stop);
2393
2394 static void get_openreq4(const struct request_sock *req,
2395                          struct seq_file *f, int i)
2396 {
2397         const struct inet_request_sock *ireq = inet_rsk(req);
2398         long delta = req->rsk_timer.expires - jiffies;
2399
2400         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2401                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2402                 i,
2403                 ireq->ir_loc_addr,
2404                 ireq->ir_num,
2405                 ireq->ir_rmt_addr,
2406                 ntohs(ireq->ir_rmt_port),
2407                 TCP_SYN_RECV,
2408                 0, 0, /* could print option size, but that is af dependent. */
2409                 1,    /* timers active (only the expire timer) */
2410                 jiffies_delta_to_clock_t(delta),
2411                 req->num_timeout,
2412                 from_kuid_munged(seq_user_ns(f),
2413                                  sock_i_uid(req->rsk_listener)),
2414                 0,  /* non standard timer */
2415                 0, /* open_requests have no inode */
2416                 0,
2417                 req);
2418 }
2419
2420 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2421 {
2422         int timer_active;
2423         unsigned long timer_expires;
2424         const struct tcp_sock *tp = tcp_sk(sk);
2425         const struct inet_connection_sock *icsk = inet_csk(sk);
2426         const struct inet_sock *inet = inet_sk(sk);
2427         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2428         __be32 dest = inet->inet_daddr;
2429         __be32 src = inet->inet_rcv_saddr;
2430         __u16 destp = ntohs(inet->inet_dport);
2431         __u16 srcp = ntohs(inet->inet_sport);
2432         int rx_queue;
2433         int state;
2434
2435         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2436             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2437             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2438                 timer_active    = 1;
2439                 timer_expires   = icsk->icsk_timeout;
2440         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2441                 timer_active    = 4;
2442                 timer_expires   = icsk->icsk_timeout;
2443         } else if (timer_pending(&sk->sk_timer)) {
2444                 timer_active    = 2;
2445                 timer_expires   = sk->sk_timer.expires;
2446         } else {
2447                 timer_active    = 0;
2448                 timer_expires = jiffies;
2449         }
2450
2451         state = inet_sk_state_load(sk);
2452         if (state == TCP_LISTEN)
2453                 rx_queue = sk->sk_ack_backlog;
2454         else
2455                 /* Because we don't lock the socket,
2456                  * we might find a transient negative value.
2457                  */
2458                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2459
2460         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2461                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2462                 i, src, srcp, dest, destp, state,
2463                 tp->write_seq - tp->snd_una,
2464                 rx_queue,
2465                 timer_active,
2466                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2467                 icsk->icsk_retransmits,
2468                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2469                 icsk->icsk_probes_out,
2470                 sock_i_ino(sk),
2471                 refcount_read(&sk->sk_refcnt), sk,
2472                 jiffies_to_clock_t(icsk->icsk_rto),
2473                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2474                 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2475                 tp->snd_cwnd,
2476                 state == TCP_LISTEN ?
2477                     fastopenq->max_qlen :
2478                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2479 }
2480
2481 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2482                                struct seq_file *f, int i)
2483 {
2484         long delta = tw->tw_timer.expires - jiffies;
2485         __be32 dest, src;
2486         __u16 destp, srcp;
2487
2488         dest  = tw->tw_daddr;
2489         src   = tw->tw_rcv_saddr;
2490         destp = ntohs(tw->tw_dport);
2491         srcp  = ntohs(tw->tw_sport);
2492
2493         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2494                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2495                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2496                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2497                 refcount_read(&tw->tw_refcnt), tw);
2498 }
2499
2500 #define TMPSZ 150
2501
2502 static int tcp4_seq_show(struct seq_file *seq, void *v)
2503 {
2504         struct tcp_iter_state *st;
2505         struct sock *sk = v;
2506
2507         seq_setwidth(seq, TMPSZ - 1);
2508         if (v == SEQ_START_TOKEN) {
2509                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2510                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2511                            "inode");
2512                 goto out;
2513         }
2514         st = seq->private;
2515
2516         if (sk->sk_state == TCP_TIME_WAIT)
2517                 get_timewait4_sock(v, seq, st->num);
2518         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2519                 get_openreq4(v, seq, st->num);
2520         else
2521                 get_tcp4_sock(v, seq, st->num);
2522 out:
2523         seq_pad(seq, '\n');
2524         return 0;
2525 }
2526
2527 static const struct seq_operations tcp4_seq_ops = {
2528         .show           = tcp4_seq_show,
2529         .start          = tcp_seq_start,
2530         .next           = tcp_seq_next,
2531         .stop           = tcp_seq_stop,
2532 };
2533
2534 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2535         .family         = AF_INET,
2536 };
2537
2538 static int __net_init tcp4_proc_init_net(struct net *net)
2539 {
2540         if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2541                         sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2542                 return -ENOMEM;
2543         return 0;
2544 }
2545
2546 static void __net_exit tcp4_proc_exit_net(struct net *net)
2547 {
2548         remove_proc_entry("tcp", net->proc_net);
2549 }
2550
2551 static struct pernet_operations tcp4_net_ops = {
2552         .init = tcp4_proc_init_net,
2553         .exit = tcp4_proc_exit_net,
2554 };
2555
2556 int __init tcp4_proc_init(void)
2557 {
2558         return register_pernet_subsys(&tcp4_net_ops);
2559 }
2560
2561 void tcp4_proc_exit(void)
2562 {
2563         unregister_pernet_subsys(&tcp4_net_ops);
2564 }
2565 #endif /* CONFIG_PROC_FS */
2566
2567 struct proto tcp_prot = {
2568         .name                   = "TCP",
2569         .owner                  = THIS_MODULE,
2570         .close                  = tcp_close,
2571         .pre_connect            = tcp_v4_pre_connect,
2572         .connect                = tcp_v4_connect,
2573         .disconnect             = tcp_disconnect,
2574         .accept                 = inet_csk_accept,
2575         .ioctl                  = tcp_ioctl,
2576         .init                   = tcp_v4_init_sock,
2577         .destroy                = tcp_v4_destroy_sock,
2578         .shutdown               = tcp_shutdown,
2579         .setsockopt             = tcp_setsockopt,
2580         .getsockopt             = tcp_getsockopt,
2581         .keepalive              = tcp_set_keepalive,
2582         .recvmsg                = tcp_recvmsg,
2583         .sendmsg                = tcp_sendmsg,
2584         .sendpage               = tcp_sendpage,
2585         .backlog_rcv            = tcp_v4_do_rcv,
2586         .release_cb             = tcp_release_cb,
2587         .hash                   = inet_hash,
2588         .unhash                 = inet_unhash,
2589         .get_port               = inet_csk_get_port,
2590         .enter_memory_pressure  = tcp_enter_memory_pressure,
2591         .leave_memory_pressure  = tcp_leave_memory_pressure,
2592         .stream_memory_free     = tcp_stream_memory_free,
2593         .sockets_allocated      = &tcp_sockets_allocated,
2594         .orphan_count           = &tcp_orphan_count,
2595         .memory_allocated       = &tcp_memory_allocated,
2596         .memory_pressure        = &tcp_memory_pressure,
2597         .sysctl_mem             = sysctl_tcp_mem,
2598         .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2599         .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2600         .max_header             = MAX_TCP_HEADER,
2601         .obj_size               = sizeof(struct tcp_sock),
2602         .slab_flags             = SLAB_TYPESAFE_BY_RCU,
2603         .twsk_prot              = &tcp_timewait_sock_ops,
2604         .rsk_prot               = &tcp_request_sock_ops,
2605         .h.hashinfo             = &tcp_hashinfo,
2606         .no_autobind            = true,
2607 #ifdef CONFIG_COMPAT
2608         .compat_setsockopt      = compat_tcp_setsockopt,
2609         .compat_getsockopt      = compat_tcp_getsockopt,
2610 #endif
2611         .diag_destroy           = tcp_abort,
2612 };
2613 EXPORT_SYMBOL(tcp_prot);
2614
2615 static void __net_exit tcp_sk_exit(struct net *net)
2616 {
2617         int cpu;
2618
2619         if (net->ipv4.tcp_congestion_control)
2620                 module_put(net->ipv4.tcp_congestion_control->owner);
2621
2622         for_each_possible_cpu(cpu)
2623                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2624         free_percpu(net->ipv4.tcp_sk);
2625 }
2626
2627 static int __net_init tcp_sk_init(struct net *net)
2628 {
2629         int res, cpu, cnt;
2630
2631         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2632         if (!net->ipv4.tcp_sk)
2633                 return -ENOMEM;
2634
2635         for_each_possible_cpu(cpu) {
2636                 struct sock *sk;
2637
2638                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2639                                            IPPROTO_TCP, net);
2640                 if (res)
2641                         goto fail;
2642                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2643
2644                 /* Please enforce IP_DF and IPID==0 for RST and
2645                  * ACK sent in SYN-RECV and TIME-WAIT state.
2646                  */
2647                 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2648
2649                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2650         }
2651
2652         net->ipv4.sysctl_tcp_ecn = 2;
2653         net->ipv4.sysctl_tcp_ecn_fallback = 1;
2654
2655         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2656         net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2657         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2658         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2659         net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
2660
2661         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2662         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2663         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2664
2665         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2666         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2667         net->ipv4.sysctl_tcp_syncookies = 1;
2668         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2669         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2670         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2671         net->ipv4.sysctl_tcp_orphan_retries = 0;
2672         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2673         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2674         net->ipv4.sysctl_tcp_tw_reuse = 2;
2675
2676         cnt = tcp_hashinfo.ehash_mask + 1;
2677         net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2678         net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2679
2680         net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2681         net->ipv4.sysctl_tcp_sack = 1;
2682         net->ipv4.sysctl_tcp_window_scaling = 1;
2683         net->ipv4.sysctl_tcp_timestamps = 1;
2684         net->ipv4.sysctl_tcp_early_retrans = 3;
2685         net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2686         net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2687         net->ipv4.sysctl_tcp_retrans_collapse = 1;
2688         net->ipv4.sysctl_tcp_max_reordering = 300;
2689         net->ipv4.sysctl_tcp_dsack = 1;
2690         net->ipv4.sysctl_tcp_app_win = 31;
2691         net->ipv4.sysctl_tcp_adv_win_scale = 1;
2692         net->ipv4.sysctl_tcp_frto = 2;
2693         net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2694         /* This limits the percentage of the congestion window which we
2695          * will allow a single TSO frame to consume.  Building TSO frames
2696          * which are too large can cause TCP streams to be bursty.
2697          */
2698         net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2699         /* Default TSQ limit of 16 TSO segments */
2700         net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
2701         /* rfc5961 challenge ack rate limiting */
2702         net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2703         net->ipv4.sysctl_tcp_min_tso_segs = 2;
2704         net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2705         net->ipv4.sysctl_tcp_autocorking = 1;
2706         net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2707         net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2708         net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2709         if (net != &init_net) {
2710                 memcpy(net->ipv4.sysctl_tcp_rmem,
2711                        init_net.ipv4.sysctl_tcp_rmem,
2712                        sizeof(init_net.ipv4.sysctl_tcp_rmem));
2713                 memcpy(net->ipv4.sysctl_tcp_wmem,
2714                        init_net.ipv4.sysctl_tcp_wmem,
2715                        sizeof(init_net.ipv4.sysctl_tcp_wmem));
2716         }
2717         net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2718         net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2719         net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2720         spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2721         net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2722         atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2723
2724         /* Reno is always built in */
2725         if (!net_eq(net, &init_net) &&
2726             try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2727                 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2728         else
2729                 net->ipv4.tcp_congestion_control = &tcp_reno;
2730
2731         return 0;
2732 fail:
2733         tcp_sk_exit(net);
2734
2735         return res;
2736 }
2737
2738 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2739 {
2740         struct net *net;
2741
2742         inet_twsk_purge(&tcp_hashinfo, AF_INET);
2743
2744         list_for_each_entry(net, net_exit_list, exit_list)
2745                 tcp_fastopen_ctx_destroy(net);
2746 }
2747
2748 static struct pernet_operations __net_initdata tcp_sk_ops = {
2749        .init       = tcp_sk_init,
2750        .exit       = tcp_sk_exit,
2751        .exit_batch = tcp_sk_exit_batch,
2752 };
2753
2754 void __init tcp_v4_init(void)
2755 {
2756         if (register_pernet_subsys(&tcp_sk_ops))
2757                 panic("Failed to create the TCP control socket.\n");
2758 }