net/ipv4/tcp_ipv4.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   4  *              operating system.  INET is implemented using the  BSD Socket
   5  *              interface as the means of communication with the user level.
   6  *
   7  *              Implementation of the Transmission Control Protocol(TCP).
   8  *
   9  *              IPv4 specific functions
  10  *
  11  *              code split from:
  12  *              linux/ipv4/tcp.c
  13  *              linux/ipv4/tcp_input.c
  14  *              linux/ipv4/tcp_output.c
  15  *
  16  *              See tcp.c for author information
  17  */
  18
  19 /*
  20  * Changes:
  21  *              David S. Miller :       New socket lookup architecture.
  22  *                                      This code is dedicated to John Dyson.
  23  *              David S. Miller :       Change semantics of established hash,
  24  *                                      half is devoted to TIME_WAIT sockets
  25  *                                      and the rest go in the other half.
  26  *              Andi Kleen :            Add support for syncookies and fixed
  27  *                                      some bugs: ip options weren't passed to
  28  *                                      the TCP layer, missed a check for an
  29  *                                      ACK bit.
  30  *              Andi Kleen :            Implemented fast path mtu discovery.
  31  *                                      Fixed many serious bugs in the
  32  *                                      request_sock handling and moved
  33  *                                      most of it into the af independent code.
  34  *                                      Added tail drop and some other bugfixes.
  35  *                                      Added new listen semantics.
  36  *              Mike McLagan    :       Routing by source
  37  *      Juan Jose Ciarlante:            ip_dynaddr bits
  38  *              Andi Kleen:             various fixes.
  39  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  40  *                                      coma.
  41  *      Andi Kleen              :       Fix new listen.
  42  *      Andi Kleen              :       Fix accept error reporting.
  43  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  44  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  45  *                                      a single port at the same time.
  46  */
  47
  48 #define pr_fmt(fmt) "TCP: " fmt
  49
  50 #include <linux/bottom_half.h>
  51 #include <linux/types.h>
  52 #include <linux/fcntl.h>
  53 #include <linux/module.h>
  54 #include <linux/random.h>
  55 #include <linux/cache.h>
  56 #include <linux/jhash.h>
  57 #include <linux/init.h>
  58 #include <linux/times.h>
  59 #include <linux/slab.h>
  60
  61 #include <net/net_namespace.h>
  62 #include <net/icmp.h>
  63 #include <net/inet_hashtables.h>
  64 #include <net/tcp.h>
  65 #include <net/transp_v6.h>
  66 #include <net/ipv6.h>
  67 #include <net/inet_common.h>
  68 #include <net/timewait_sock.h>
  69 #include <net/xfrm.h>
  70 #include <net/secure_seq.h>
  71 #include <net/busy_poll.h>
  72
  73 #include <linux/inet.h>
  74 #include <linux/ipv6.h>
  75 #include <linux/stddef.h>
  76 #include <linux/proc_fs.h>
  77 #include <linux/seq_file.h>
  78 #include <linux/inetdevice.h>
  79
  80 #include <crypto/hash.h>
  81 #include <linux/scatterlist.h>
  82
  83 #include <trace/events/tcp.h>
  84
  85 #ifdef CONFIG_TCP_MD5SIG
  86 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
  87                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
  88 #endif
  89
  90 struct inet_hashinfo tcp_hashinfo;
  91 EXPORT_SYMBOL(tcp_hashinfo);
  92
  93 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
  94 {
  95         return secure_tcp_seq(ip_hdr(skb)->daddr,
  96                               ip_hdr(skb)->saddr,
  97                               tcp_hdr(skb)->dest,
  98                               tcp_hdr(skb)->source);
  99 }
 100
 101 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
 102 {
 103         return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
 104 }
 105
 106 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 107 {
 108         const struct inet_timewait_sock *tw = inet_twsk(sktw);
 109         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 110         struct tcp_sock *tp = tcp_sk(sk);
 111         int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
 112
 113         if (reuse == 2) {
 114                 /* Still does not detect *everything* that goes through
 115                  * lo, since we require a loopback src or dst address
 116                  * or direct binding to 'lo' interface.
 117                  */
 118                 bool loopback = false;
 119                 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
 120                         loopback = true;
 121 #if IS_ENABLED(CONFIG_IPV6)
 122                 if (tw->tw_family == AF_INET6) {
 123                         if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
 124                             (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
 125                              (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
 126                             ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
 127                             (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
 128                              (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
 129                                 loopback = true;
 130                 } else
 131 #endif
 132                 {
 133                         if (ipv4_is_loopback(tw->tw_daddr) ||
 134                             ipv4_is_loopback(tw->tw_rcv_saddr))
 135                                 loopback = true;
 136                 }
 137                 if (!loopback)
 138                         reuse = 0;
 139         }
 140
 141         /* With PAWS, it is safe from the viewpoint
 142            of data integrity. Even without PAWS it is safe provided sequence
 143            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 144
 145            Actually, the idea is close to VJ's one, only timestamp cache is
 146            held not per host, but per port pair and TW bucket is used as state
 147            holder.
 148
 149            If TW bucket has been already destroyed we fall back to VJ's scheme
 150            and use initial timestamp retrieved from peer table.
 151          */
 152         if (tcptw->tw_ts_recent_stamp &&
 153             (!twp || (reuse && time_after32(ktime_get_seconds(),
 154                                             tcptw->tw_ts_recent_stamp)))) {
 155                 /* In case of repair and re-using TIME-WAIT sockets we still
 156                  * want to be sure that it is safe as above but honor the
 157                  * sequence numbers and time stamps set as part of the repair
 158                  * process.
 159                  *
 160                  * Without this check re-using a TIME-WAIT socket with TCP
 161                  * repair would accumulate a -1 on the repair assigned
 162                  * sequence number. The first time it is reused the sequence
 163                  * is -1, the second time -2, etc. This fixes that issue
 164                  * without appearing to create any others.
 165                  */
 166                 if (likely(!tp->repair)) {
 167                         tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 168                         if (tp->write_seq == 0)
 169                                 tp->write_seq = 1;
 170                         tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 171                         tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 172                 }
 173                 sock_hold(sktw);
 174                 return 1;
 175         }
 176
 177         return 0;
 178 }
 179 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 180
 181 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
 182                               int addr_len)
 183 {
 184         /* This check is replicated from tcp_v4_connect() and intended to
 185          * prevent BPF program called below from accessing bytes that are out
 186          * of the bound specified by user in addr_len.
 187          */
 188         if (addr_len < sizeof(struct sockaddr_in))
 189                 return -EINVAL;
 190
 191         sock_owned_by_me(sk);
 192
 193         return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
 194 }
 195
 196 /* This will initiate an outgoing connection. */
 197 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 198 {
 199         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 200         struct inet_sock *inet = inet_sk(sk);
 201         struct tcp_sock *tp = tcp_sk(sk);
 202         __be16 orig_sport, orig_dport;
 203         __be32 daddr, nexthop;
 204         struct flowi4 *fl4;
 205         struct rtable *rt;
 206         int err;
 207         struct ip_options_rcu *inet_opt;
 208         struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
 209
 210         if (addr_len < sizeof(struct sockaddr_in))
 211                 return -EINVAL;
 212
 213         if (usin->sin_family != AF_INET)
 214                 return -EAFNOSUPPORT;
 215
 216         nexthop = daddr = usin->sin_addr.s_addr;
 217         inet_opt = rcu_dereference_protected(inet->inet_opt,
 218                                              lockdep_sock_is_held(sk));
 219         if (inet_opt && inet_opt->opt.srr) {
 220                 if (!daddr)
 221                         return -EINVAL;
 222                 nexthop = inet_opt->opt.faddr;
 223         }
 224
 225         orig_sport = inet->inet_sport;
 226         orig_dport = usin->sin_port;
 227         fl4 = &inet->cork.fl.u.ip4;
 228         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
 229                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 230                               IPPROTO_TCP,
 231                               orig_sport, orig_dport, sk);
 232         if (IS_ERR(rt)) {
 233                 err = PTR_ERR(rt);
 234                 if (err == -ENETUNREACH)
 235                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 236                 return err;
 237         }
 238
 239         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 240                 ip_rt_put(rt);
 241                 return -ENETUNREACH;
 242         }
 243
 244         if (!inet_opt || !inet_opt->opt.srr)
 245                 daddr = fl4->daddr;
 246
 247         if (!inet->inet_saddr)
 248                 inet->inet_saddr = fl4->saddr;
 249         sk_rcv_saddr_set(sk, inet->inet_saddr);
 250
 251         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 252                 /* Reset inherited state */
 253                 tp->rx_opt.ts_recent       = 0;
 254                 tp->rx_opt.ts_recent_stamp = 0;
 255                 if (likely(!tp->repair))
 256                         tp->write_seq      = 0;
 257         }
 258
 259         inet->inet_dport = usin->sin_port;
 260         sk_daddr_set(sk, daddr);
 261
 262         inet_csk(sk)->icsk_ext_hdr_len = 0;
 263         if (inet_opt)
 264                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
 265
 266         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 267
 268         /* Socket identity is still unknown (sport may be zero).
 269          * However we set state to SYN-SENT and not releasing socket
 270          * lock select source port, enter ourselves into the hash tables and
 271          * complete initialization after this.
 272          */
 273         tcp_set_state(sk, TCP_SYN_SENT);
 274         err = inet_hash_connect(tcp_death_row, sk);
 275         if (err)
 276                 goto failure;
 277
 278         sk_set_txhash(sk);
 279
 280         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
 281                                inet->inet_sport, inet->inet_dport, sk);
 282         if (IS_ERR(rt)) {
 283                 err = PTR_ERR(rt);
 284                 rt = NULL;
 285                 goto failure;
 286         }
 287         /* OK, now commit destination to socket.  */
 288         sk->sk_gso_type = SKB_GSO_TCPV4;
 289         sk_setup_caps(sk, &rt->dst);
 290         rt = NULL;
 291
 292         if (likely(!tp->repair)) {
 293                 if (!tp->write_seq)
 294                         tp->write_seq = secure_tcp_seq(inet->inet_saddr,
 295                                                        inet->inet_daddr,
 296                                                        inet->inet_sport,
 297                                                        usin->sin_port);
 298                 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
 299                                                  inet->inet_saddr,
 300                                                  inet->inet_daddr);
 301         }
 302
 303         inet->inet_id = tp->write_seq ^ jiffies;
 304
 305         if (tcp_fastopen_defer_connect(sk, &err))
 306                 return err;
 307         if (err)
 308                 goto failure;
 309
 310         err = tcp_connect(sk);
 311
 312         if (err)
 313                 goto failure;
 314
 315         return 0;
 316
 317 failure:
 318         /*
 319          * This unhashes the socket and releases the local port,
 320          * if necessary.
 321          */
 322         tcp_set_state(sk, TCP_CLOSE);
 323         ip_rt_put(rt);
 324         sk->sk_route_caps = 0;
 325         inet->inet_dport = 0;
 326         return err;
 327 }
 328 EXPORT_SYMBOL(tcp_v4_connect);
 329
 330 /*
 331  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 332  * It can be called through tcp_release_cb() if socket was owned by user
 333  * at the time tcp_v4_err() was called to handle ICMP message.
 334  */
 335 void tcp_v4_mtu_reduced(struct sock *sk)
 336 {
 337         struct inet_sock *inet = inet_sk(sk);
 338         struct dst_entry *dst;
 339         u32 mtu;
 340
 341         if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
 342                 return;
 343         mtu = tcp_sk(sk)->mtu_info;
 344         dst = inet_csk_update_pmtu(sk, mtu);
 345         if (!dst)
 346                 return;
 347
 348         /* Something is about to be wrong... Remember soft error
 349          * for the case, if this connection will not able to recover.
 350          */
 351         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 352                 sk->sk_err_soft = EMSGSIZE;
 353
 354         mtu = dst_mtu(dst);
 355
 356         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 357             ip_sk_accept_pmtu(sk) &&
 358             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 359                 tcp_sync_mss(sk, mtu);
 360
 361                 /* Resend the TCP packet because it's
 362                  * clear that the old packet has been
 363                  * dropped. This is the new "fast" path mtu
 364                  * discovery.
 365                  */
 366                 tcp_simple_retransmit(sk);
 367         } /* else let the usual retransmit timer handle it */
 368 }
 369 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
 370
 371 static void do_redirect(struct sk_buff *skb, struct sock *sk)
 372 {
 373         struct dst_entry *dst = __sk_dst_check(sk, 0);
 374
 375         if (dst)
 376                 dst->ops->redirect(dst, sk, skb);
 377 }
 378
 379
 380 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
 381 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
 382 {
 383         struct request_sock *req = inet_reqsk(sk);
 384         struct net *net = sock_net(sk);
 385
 386         /* ICMPs are not backlogged, hence we cannot get
 387          * an established socket here.
 388          */
 389         if (seq != tcp_rsk(req)->snt_isn) {
 390                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
 391         } else if (abort) {
 392                 /*
 393                  * Still in SYN_RECV, just remove it silently.
 394                  * There is no good way to pass the error to the newly
 395                  * created socket, and POSIX does not want network
 396                  * errors returned from accept().
 397                  */
 398                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
 399                 tcp_listendrop(req->rsk_listener);
 400         }
 401         reqsk_put(req);
 402 }
 403 EXPORT_SYMBOL(tcp_req_err);
 404
 405 /*
 406  * This routine is called by the ICMP module when it gets some
 407  * sort of error condition.  If err < 0 then the socket should
 408  * be closed and the error returned to the user.  If err > 0
 409  * it's just the icmp type << 8 | icmp code.  After adjustment
 410  * header points to the first 8 bytes of the tcp header.  We need
 411  * to find the appropriate port.
 412  *
 413  * The locking strategy used here is very "optimistic". When
 414  * someone else accesses the socket the ICMP is just dropped
 415  * and for some paths there is no check at all.
 416  * A more general error queue to queue errors for later handling
 417  * is probably better.
 418  *
 419  */
 420
 421 int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 422 {
 423         const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
 424         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
 425         struct inet_connection_sock *icsk;
 426         struct tcp_sock *tp;
 427         struct inet_sock *inet;
 428         const int type = icmp_hdr(icmp_skb)->type;
 429         const int code = icmp_hdr(icmp_skb)->code;
 430         struct sock *sk;
 431         struct sk_buff *skb;
 432         struct request_sock *fastopen;
 433         u32 seq, snd_una;
 434         s32 remaining;
 435         u32 delta_us;
 436         int err;
 437         struct net *net = dev_net(icmp_skb->dev);
 438
 439         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
 440                                        th->dest, iph->saddr, ntohs(th->source),
 441                                        inet_iif(icmp_skb), 0);
 442         if (!sk) {
 443                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
 444                 return -ENOENT;
 445         }
 446         if (sk->sk_state == TCP_TIME_WAIT) {
 447                 inet_twsk_put(inet_twsk(sk));
 448                 return 0;
 449         }
 450         seq = ntohl(th->seq);
 451         if (sk->sk_state == TCP_NEW_SYN_RECV) {
 452                 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
 453                                      type == ICMP_TIME_EXCEEDED ||
 454                                      (type == ICMP_DEST_UNREACH &&
 455                                       (code == ICMP_NET_UNREACH ||
 456                                        code == ICMP_HOST_UNREACH)));
 457                 return 0;
 458         }
 459
 460         bh_lock_sock(sk);
 461         /* If too many ICMPs get dropped on busy
 462          * servers this needs to be solved differently.
 463          * We do take care of PMTU discovery (RFC1191) special case :
 464          * we can receive locally generated ICMP messages while socket is held.
 465          */
 466         if (sock_owned_by_user(sk)) {
 467                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
 468                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
 469         }
 470         if (sk->sk_state == TCP_CLOSE)
 471                 goto out;
 472
 473         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
 474                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
 475                 goto out;
 476         }
 477
 478         icsk = inet_csk(sk);
 479         tp = tcp_sk(sk);
 480         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
 481         fastopen = tp->fastopen_rsk;
 482         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
 483         if (sk->sk_state != TCP_LISTEN &&
 484             !between(seq, snd_una, tp->snd_nxt)) {
 485                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
 486                 goto out;
 487         }
 488
 489         switch (type) {
 490         case ICMP_REDIRECT:
 491                 if (!sock_owned_by_user(sk))
 492                         do_redirect(icmp_skb, sk);
 493                 goto out;
 494         case ICMP_SOURCE_QUENCH:
 495                 /* Just silently ignore these. */
 496                 goto out;
 497         case ICMP_PARAMETERPROB:
 498                 err = EPROTO;
 499                 break;
 500         case ICMP_DEST_UNREACH:
 501                 if (code > NR_ICMP_UNREACH)
 502                         goto out;
 503
 504                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 505                         /* We are not interested in TCP_LISTEN and open_requests
 506                          * (SYN-ACKs send out by Linux are always <576bytes so
 507                          * they should go through unfragmented).
 508                          */
 509                         if (sk->sk_state == TCP_LISTEN)
 510                                 goto out;
 511
 512                         tp->mtu_info = info;
 513                         if (!sock_owned_by_user(sk)) {
 514                                 tcp_v4_mtu_reduced(sk);
 515                         } else {
 516                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
 517                                         sock_hold(sk);
 518                         }
 519                         goto out;
 520                 }
 521
 522                 err = icmp_err_convert[code].errno;
 523                 /* check if icmp_skb allows revert of backoff
 524                  * (see draft-zimmermann-tcp-lcd) */
 525                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
 526                         break;
 527                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
 528                     !icsk->icsk_backoff || fastopen)
 529                         break;
 530
 531                 if (sock_owned_by_user(sk))
 532                         break;
 533
 534                 skb = tcp_rtx_queue_head(sk);
 535                 if (WARN_ON_ONCE(!skb))
 536                         break;
 537
 538                 icsk->icsk_backoff--;
 539                 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
 540                                                TCP_TIMEOUT_INIT;
 541                 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
 542
 543
 544                 tcp_mstamp_refresh(tp);
 545                 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
 546                 remaining = icsk->icsk_rto -
 547                             usecs_to_jiffies(delta_us);
 548
 549                 if (remaining > 0) {
 550                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 551                                                   remaining, TCP_RTO_MAX);
 552                 } else {
 553                         /* RTO revert clocked out retransmission.
 554                          * Will retransmit now */
 555                         tcp_retransmit_timer(sk);
 556                 }
 557
 558                 break;
 559         case ICMP_TIME_EXCEEDED:
 560                 err = EHOSTUNREACH;
 561                 break;
 562         default:
 563                 goto out;
 564         }
 565
 566         switch (sk->sk_state) {
 567         case TCP_SYN_SENT:
 568         case TCP_SYN_RECV:
 569                 /* Only in fast or simultaneous open. If a fast open socket is
 570                  * is already accepted it is treated as a connected one below.
 571                  */
 572                 if (fastopen && !fastopen->sk)
 573                         break;
 574
 575                 if (!sock_owned_by_user(sk)) {
 576                         sk->sk_err = err;
 577
 578                         sk->sk_error_report(sk);
 579
 580                         tcp_done(sk);
 581                 } else {
 582                         sk->sk_err_soft = err;
 583                 }
 584                 goto out;
 585         }
 586
 587         /* If we've already connected we will keep trying
 588          * until we time out, or the user gives up.
 589          *
 590          * rfc1122 4.2.3.9 allows to consider as hard errors
 591          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 592          * but it is obsoleted by pmtu discovery).
 593          *
 594          * Note, that in modern internet, where routing is unreliable
 595          * and in each dark corner broken firewalls sit, sending random
 596          * errors ordered by their masters even this two messages finally lose
 597          * their original sense (even Linux sends invalid PORT_UNREACHs)
 598          *
 599          * Now we are in compliance with RFCs.
 600          *                                                      --ANK (980905)
 601          */
 602
 603         inet = inet_sk(sk);
 604         if (!sock_owned_by_user(sk) && inet->recverr) {
 605                 sk->sk_err = err;
 606                 sk->sk_error_report(sk);
 607         } else  { /* Only an error on timeout */
 608                 sk->sk_err_soft = err;
 609         }
 610
 611 out:
 612         bh_unlock_sock(sk);
 613         sock_put(sk);
 614         return 0;
 615 }
 616
 617 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
 618 {
 619         struct tcphdr *th = tcp_hdr(skb);
 620
 621         th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
 622         skb->csum_start = skb_transport_header(skb) - skb->head;
 623         skb->csum_offset = offsetof(struct tcphdr, check);
 624 }
 625
 626 /* This routine computes an IPv4 TCP checksum. */
 627 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 628 {
 629         const struct inet_sock *inet = inet_sk(sk);
 630
 631         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
 632 }
 633 EXPORT_SYMBOL(tcp_v4_send_check);
 634
 635 /*
 636  *      This routine will send an RST to the other tcp.
 637  *
 638  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 639  *                    for reset.
 640  *      Answer: if a packet caused RST, it is not for a socket
 641  *              existing in our system, if it is matched to a socket,
 642  *              it is just duplicate segment or bug in other side's TCP.
 643  *              So that we build reply only basing on parameters
 644  *              arrived with segment.
 645  *      Exception: precedence violation. We do not implement it in any case.
 646  */
 647
 648 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 649 {
 650         const struct tcphdr *th = tcp_hdr(skb);
 651         struct {
 652                 struct tcphdr th;
 653 #ifdef CONFIG_TCP_MD5SIG
 654                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
 655 #endif
 656         } rep;
 657         struct ip_reply_arg arg;
 658 #ifdef CONFIG_TCP_MD5SIG
 659         struct tcp_md5sig_key *key = NULL;
 660         const __u8 *hash_location = NULL;
 661         unsigned char newhash[16];
 662         int genhash;
 663         struct sock *sk1 = NULL;
 664 #endif
 665         u64 transmit_time = 0;
 666         struct sock *ctl_sk;
 667         struct net *net;
 668
 669         /* Never send a reset in response to a reset. */
 670         if (th->rst)
 671                 return;
 672
 673         /* If sk not NULL, it means we did a successful lookup and incoming
 674          * route had to be correct. prequeue might have dropped our dst.
 675          */
 676         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
 677                 return;
 678
 679         /* Swap the send and the receive. */
 680         memset(&rep, 0, sizeof(rep));
 681         rep.th.dest   = th->source;
 682         rep.th.source = th->dest;
 683         rep.th.doff   = sizeof(struct tcphdr) / 4;
 684         rep.th.rst    = 1;
 685
 686         if (th->ack) {
 687                 rep.th.seq = th->ack_seq;
 688         } else {
 689                 rep.th.ack = 1;
 690                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 691                                        skb->len - (th->doff << 2));
 692         }
 693
 694         memset(&arg, 0, sizeof(arg));
 695         arg.iov[0].iov_base = (unsigned char *)&rep;
 696         arg.iov[0].iov_len  = sizeof(rep.th);
 697
 698         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
 699 #ifdef CONFIG_TCP_MD5SIG
 700         rcu_read_lock();
 701         hash_location = tcp_parse_md5sig_option(th);
 702         if (sk && sk_fullsock(sk)) {
 703                 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
 704                                         &ip_hdr(skb)->saddr, AF_INET);
 705         } else if (hash_location) {
 706                 /*
 707                  * active side is lost. Try to find listening socket through
 708                  * source port, and then find md5 key through listening socket.
 709                  * we are not loose security here:
 710                  * Incoming packet is checked with md5 hash with finding key,
 711                  * no RST generated if md5 hash doesn't match.
 712                  */
 713                 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
 714                                              ip_hdr(skb)->saddr,
 715                                              th->source, ip_hdr(skb)->daddr,
 716                                              ntohs(th->source), inet_iif(skb),
 717                                              tcp_v4_sdif(skb));
 718                 /* don't send rst if it can't find key */
 719                 if (!sk1)
 720                         goto out;
 721
 722                 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
 723                                         &ip_hdr(skb)->saddr, AF_INET);
 724                 if (!key)
 725                         goto out;
 726
 727
 728                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
 729                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
 730                         goto out;
 731
 732         }
 733
 734         if (key) {
 735                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 736                                    (TCPOPT_NOP << 16) |
 737                                    (TCPOPT_MD5SIG << 8) |
 738                                    TCPOLEN_MD5SIG);
 739                 /* Update length and the length the header thinks exists */
 740                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 741                 rep.th.doff = arg.iov[0].iov_len / 4;
 742
 743                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 744                                      key, ip_hdr(skb)->saddr,
 745                                      ip_hdr(skb)->daddr, &rep.th);
 746         }
 747 #endif
 748         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 749                                       ip_hdr(skb)->saddr, /* XXX */
 750                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 751         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 752         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 753
 754         /* When socket is gone, all binding information is lost.
 755          * routing might fail in this case. No choice here, if we choose to force
 756          * input interface, we will misroute in case of asymmetric route.
 757          */
 758         if (sk) {
 759                 arg.bound_dev_if = sk->sk_bound_dev_if;
 760                 if (sk_fullsock(sk))
 761                         trace_tcp_send_reset(sk, skb);
 762         }
 763
 764         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
 765                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
 766
 767         arg.tos = ip_hdr(skb)->tos;
 768         arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
 769         local_bh_disable();
 770         ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
 771         if (sk) {
 772                 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
 773                                    inet_twsk(sk)->tw_mark : sk->sk_mark;
 774                 transmit_time = tcp_transmit_time(sk);
 775         }
 776         ip_send_unicast_reply(ctl_sk,
 777                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
 778                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 779                               &arg, arg.iov[0].iov_len,
 780                               transmit_time);
 781
 782         ctl_sk->sk_mark = 0;
 783         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 784         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
 785         local_bh_enable();
 786
 787 #ifdef CONFIG_TCP_MD5SIG
 788 out:
 789         rcu_read_unlock();
 790 #endif
 791 }
 792
 793 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
 794    outside socket context is ugly, certainly. What can I do?
 795  */
 796
 797 static void tcp_v4_send_ack(const struct sock *sk,
 798                             struct sk_buff *skb, u32 seq, u32 ack,
 799                             u32 win, u32 tsval, u32 tsecr, int oif,
 800                             struct tcp_md5sig_key *key,
 801                             int reply_flags, u8 tos)
 802 {
 803         const struct tcphdr *th = tcp_hdr(skb);
 804         struct {
 805                 struct tcphdr th;
 806                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 807 #ifdef CONFIG_TCP_MD5SIG
 808                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 809 #endif
 810                         ];
 811         } rep;
 812         struct net *net = sock_net(sk);
 813         struct ip_reply_arg arg;
 814         struct sock *ctl_sk;
 815         u64 transmit_time;
 816
 817         memset(&rep.th, 0, sizeof(struct tcphdr));
 818         memset(&arg, 0, sizeof(arg));
 819
 820         arg.iov[0].iov_base = (unsigned char *)&rep;
 821         arg.iov[0].iov_len  = sizeof(rep.th);
 822         if (tsecr) {
 823                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 824                                    (TCPOPT_TIMESTAMP << 8) |
 825                                    TCPOLEN_TIMESTAMP);
 826                 rep.opt[1] = htonl(tsval);
 827                 rep.opt[2] = htonl(tsecr);
 828                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 829         }
 830
 831         /* Swap the send and the receive. */
 832         rep.th.dest    = th->source;
 833         rep.th.source  = th->dest;
 834         rep.th.doff    = arg.iov[0].iov_len / 4;
 835         rep.th.seq     = htonl(seq);
 836         rep.th.ack_seq = htonl(ack);
 837         rep.th.ack     = 1;
 838         rep.th.window  = htons(win);
 839
 840 #ifdef CONFIG_TCP_MD5SIG
 841         if (key) {
 842                 int offset = (tsecr) ? 3 : 0;
 843
 844                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 845                                           (TCPOPT_NOP << 16) |
 846                                           (TCPOPT_MD5SIG << 8) |
 847                                           TCPOLEN_MD5SIG);
 848                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 849                 rep.th.doff = arg.iov[0].iov_len/4;
 850
 851                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 852                                     key, ip_hdr(skb)->saddr,
 853                                     ip_hdr(skb)->daddr, &rep.th);
 854         }
 855 #endif
 856         arg.flags = reply_flags;
 857         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 858                                       ip_hdr(skb)->saddr, /* XXX */
 859                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 860         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 861         if (oif)
 862                 arg.bound_dev_if = oif;
 863         arg.tos = tos;
 864         arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
 865         local_bh_disable();
 866         ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
 867         ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
 868                            inet_twsk(sk)->tw_mark : sk->sk_mark;
 869         transmit_time = tcp_transmit_time(sk);
 870         ip_send_unicast_reply(ctl_sk,
 871                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
 872                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 873                               &arg, arg.iov[0].iov_len,
 874                               transmit_time);
 875
 876         ctl_sk->sk_mark = 0;
 877         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 878         local_bh_enable();
 879 }
 880
 881 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 882 {
 883         struct inet_timewait_sock *tw = inet_twsk(sk);
 884         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 885
 886         tcp_v4_send_ack(sk, skb,
 887                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 888                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 889                         tcp_time_stamp_raw() + tcptw->tw_ts_offset,
 890                         tcptw->tw_ts_recent,
 891                         tw->tw_bound_dev_if,
 892                         tcp_twsk_md5_key(tcptw),
 893                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
 894                         tw->tw_tos
 895                         );
 896
 897         inet_twsk_put(tw);
 898 }
 899
 900 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 901                                   struct request_sock *req)
 902 {
 903         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
 904          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
 905          */
 906         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
 907                                              tcp_sk(sk)->snd_nxt;
 908
 909         /* RFC 7323 2.3
 910          * The window field (SEG.WND) of every outgoing segment, with the
 911          * exception of <SYN> segments, MUST be right-shifted by
 912          * Rcv.Wind.Shift bits:
 913          */
 914         tcp_v4_send_ack(sk, skb, seq,
 915                         tcp_rsk(req)->rcv_nxt,
 916                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
 917                         tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
 918                         req->ts_recent,
 919                         0,
 920                         tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
 921                                           AF_INET),
 922                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
 923                         ip_hdr(skb)->tos);
 924 }
 925
 926 /*
 927  *      Send a SYN-ACK after having received a SYN.
 928  *      This still operates on a request_sock only, not on a big
 929  *      socket.
 930  */
 931 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
 932                               struct flowi *fl,
 933                               struct request_sock *req,
 934                               struct tcp_fastopen_cookie *foc,
 935                               enum tcp_synack_type synack_type)
 936 {
 937         const struct inet_request_sock *ireq = inet_rsk(req);
 938         struct flowi4 fl4;
 939         int err = -1;
 940         struct sk_buff *skb;
 941
 942         /* First, grab a route. */
 943         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
 944                 return -1;
 945
 946         skb = tcp_make_synack(sk, dst, req, foc, synack_type);
 947
 948         if (skb) {
 949                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
 950
 951                 rcu_read_lock();
 952                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
 953                                             ireq->ir_rmt_addr,
 954                                             rcu_dereference(ireq->ireq_opt));
 955                 rcu_read_unlock();
 956                 err = net_xmit_eval(err);
 957         }
 958
 959         return err;
 960 }
 961
 962 /*
 963  *      IPv4 request_sock destructor.
 964  */
 965 static void tcp_v4_reqsk_destructor(struct request_sock *req)
 966 {
 967         kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
 968 }
 969
 970 #ifdef CONFIG_TCP_MD5SIG
 971 /*
 972  * RFC2385 MD5 checksumming requires a mapping of
 973  * IP address->MD5 Key.
 974  * We need to maintain these in the sk structure.
 975  */
 976
 977 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
 978 EXPORT_SYMBOL(tcp_md5_needed);
 979
 980 /* Find the Key structure for an address.  */
 981 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk,
 982                                            const union tcp_md5_addr *addr,
 983                                            int family)
 984 {
 985         const struct tcp_sock *tp = tcp_sk(sk);
 986         struct tcp_md5sig_key *key;
 987         const struct tcp_md5sig_info *md5sig;
 988         __be32 mask;
 989         struct tcp_md5sig_key *best_match = NULL;
 990         bool match;
 991
 992         /* caller either holds rcu_read_lock() or socket lock */
 993         md5sig = rcu_dereference_check(tp->md5sig_info,
 994                                        lockdep_sock_is_held(sk));
 995         if (!md5sig)
 996                 return NULL;
 997
 998         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
 999                 if (key->family != family)
1000                         continue;
1001
1002                 if (family == AF_INET) {
1003                         mask = inet_make_mask(key->prefixlen);
1004                         match = (key->addr.a4.s_addr & mask) ==
1005                                 (addr->a4.s_addr & mask);
1006 #if IS_ENABLED(CONFIG_IPV6)
1007                 } else if (family == AF_INET6) {
1008                         match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1009                                                   key->prefixlen);
1010 #endif
1011                 } else {
1012                         match = false;
1013                 }
1014
1015                 if (match && (!best_match ||
1016                               key->prefixlen > best_match->prefixlen))
1017                         best_match = key;
1018         }
1019         return best_match;
1020 }
1021 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1022
1023 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1024                                                       const union tcp_md5_addr *addr,
1025                                                       int family, u8 prefixlen)
1026 {
1027         const struct tcp_sock *tp = tcp_sk(sk);
1028         struct tcp_md5sig_key *key;
1029         unsigned int size = sizeof(struct in_addr);
1030         const struct tcp_md5sig_info *md5sig;
1031
1032         /* caller either holds rcu_read_lock() or socket lock */
1033         md5sig = rcu_dereference_check(tp->md5sig_info,
1034                                        lockdep_sock_is_held(sk));
1035         if (!md5sig)
1036                 return NULL;
1037 #if IS_ENABLED(CONFIG_IPV6)
1038         if (family == AF_INET6)
1039                 size = sizeof(struct in6_addr);
1040 #endif
1041         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1042                 if (key->family != family)
1043                         continue;
1044                 if (!memcmp(&key->addr, addr, size) &&
1045                     key->prefixlen == prefixlen)
1046                         return key;
1047         }
1048         return NULL;
1049 }
1050
1051 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1052                                          const struct sock *addr_sk)
1053 {
1054         const union tcp_md5_addr *addr;
1055
1056         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1057         return tcp_md5_do_lookup(sk, addr, AF_INET);
1058 }
1059 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1060
1061 /* This can be called on a newly created socket, from other files */
1062 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1063                    int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
1064                    gfp_t gfp)
1065 {
1066         /* Add Key to the list */
1067         struct tcp_md5sig_key *key;
1068         struct tcp_sock *tp = tcp_sk(sk);
1069         struct tcp_md5sig_info *md5sig;
1070
1071         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1072         if (key) {
1073                 /* Pre-existing entry - just update that one. */
1074                 memcpy(key->key, newkey, newkeylen);
1075                 key->keylen = newkeylen;
1076                 return 0;
1077         }
1078
1079         md5sig = rcu_dereference_protected(tp->md5sig_info,
1080                                            lockdep_sock_is_held(sk));
1081         if (!md5sig) {
1082                 md5sig = kmalloc(sizeof(*md5sig), gfp);
1083                 if (!md5sig)
1084                         return -ENOMEM;
1085
1086                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1087                 INIT_HLIST_HEAD(&md5sig->head);
1088                 rcu_assign_pointer(tp->md5sig_info, md5sig);
1089         }
1090
1091         key = sock_kmalloc(sk, sizeof(*key), gfp);
1092         if (!key)
1093                 return -ENOMEM;
1094         if (!tcp_alloc_md5sig_pool()) {
1095                 sock_kfree_s(sk, key, sizeof(*key));
1096                 return -ENOMEM;
1097         }
1098
1099         memcpy(key->key, newkey, newkeylen);
1100         key->keylen = newkeylen;
1101         key->family = family;
1102         key->prefixlen = prefixlen;
1103         memcpy(&key->addr, addr,
1104                (family == AF_INET6) ? sizeof(struct in6_addr) :
1105                                       sizeof(struct in_addr));
1106         hlist_add_head_rcu(&key->node, &md5sig->head);
1107         return 0;
1108 }
1109 EXPORT_SYMBOL(tcp_md5_do_add);
1110
1111 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1112                    u8 prefixlen)
1113 {
1114         struct tcp_md5sig_key *key;
1115
1116         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1117         if (!key)
1118                 return -ENOENT;
1119         hlist_del_rcu(&key->node);
1120         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1121         kfree_rcu(key, rcu);
1122         return 0;
1123 }
1124 EXPORT_SYMBOL(tcp_md5_do_del);
1125
1126 static void tcp_clear_md5_list(struct sock *sk)
1127 {
1128         struct tcp_sock *tp = tcp_sk(sk);
1129         struct tcp_md5sig_key *key;
1130         struct hlist_node *n;
1131         struct tcp_md5sig_info *md5sig;
1132
1133         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1134
1135         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1136                 hlist_del_rcu(&key->node);
1137                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1138                 kfree_rcu(key, rcu);
1139         }
1140 }
1141
1142 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1143                                  char __user *optval, int optlen)
1144 {
1145         struct tcp_md5sig cmd;
1146         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1147         u8 prefixlen = 32;
1148
1149         if (optlen < sizeof(cmd))
1150                 return -EINVAL;
1151
1152         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1153                 return -EFAULT;
1154
1155         if (sin->sin_family != AF_INET)
1156                 return -EINVAL;
1157
1158         if (optname == TCP_MD5SIG_EXT &&
1159             cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1160                 prefixlen = cmd.tcpm_prefixlen;
1161                 if (prefixlen > 32)
1162                         return -EINVAL;
1163         }
1164
1165         if (!cmd.tcpm_keylen)
1166                 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1167                                       AF_INET, prefixlen);
1168
1169         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1170                 return -EINVAL;
1171
1172         return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1173                               AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
1174                               GFP_KERNEL);
1175 }
1176
1177 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1178                                    __be32 daddr, __be32 saddr,
1179                                    const struct tcphdr *th, int nbytes)
1180 {
1181         struct tcp4_pseudohdr *bp;
1182         struct scatterlist sg;
1183         struct tcphdr *_th;
1184
1185         bp = hp->scratch;
1186         bp->saddr = saddr;
1187         bp->daddr = daddr;
1188         bp->pad = 0;
1189         bp->protocol = IPPROTO_TCP;
1190         bp->len = cpu_to_be16(nbytes);
1191
1192         _th = (struct tcphdr *)(bp + 1);
1193         memcpy(_th, th, sizeof(*th));
1194         _th->check = 0;
1195
1196         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1197         ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1198                                 sizeof(*bp) + sizeof(*th));
1199         return crypto_ahash_update(hp->md5_req);
1200 }
1201
1202 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1203                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1204 {
1205         struct tcp_md5sig_pool *hp;
1206         struct ahash_request *req;
1207
1208         hp = tcp_get_md5sig_pool();
1209         if (!hp)
1210                 goto clear_hash_noput;
1211         req = hp->md5_req;
1212
1213         if (crypto_ahash_init(req))
1214                 goto clear_hash;
1215         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1216                 goto clear_hash;
1217         if (tcp_md5_hash_key(hp, key))
1218                 goto clear_hash;
1219         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1220         if (crypto_ahash_final(req))
1221                 goto clear_hash;
1222
1223         tcp_put_md5sig_pool();
1224         return 0;
1225
1226 clear_hash:
1227         tcp_put_md5sig_pool();
1228 clear_hash_noput:
1229         memset(md5_hash, 0, 16);
1230         return 1;
1231 }
1232
1233 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1234                         const struct sock *sk,
1235                         const struct sk_buff *skb)
1236 {
1237         struct tcp_md5sig_pool *hp;
1238         struct ahash_request *req;
1239         const struct tcphdr *th = tcp_hdr(skb);
1240         __be32 saddr, daddr;
1241
1242         if (sk) { /* valid for establish/request sockets */
1243                 saddr = sk->sk_rcv_saddr;
1244                 daddr = sk->sk_daddr;
1245         } else {
1246                 const struct iphdr *iph = ip_hdr(skb);
1247                 saddr = iph->saddr;
1248                 daddr = iph->daddr;
1249         }
1250
1251         hp = tcp_get_md5sig_pool();
1252         if (!hp)
1253                 goto clear_hash_noput;
1254         req = hp->md5_req;
1255
1256         if (crypto_ahash_init(req))
1257                 goto clear_hash;
1258
1259         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1260                 goto clear_hash;
1261         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1262                 goto clear_hash;
1263         if (tcp_md5_hash_key(hp, key))
1264                 goto clear_hash;
1265         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1266         if (crypto_ahash_final(req))
1267                 goto clear_hash;
1268
1269         tcp_put_md5sig_pool();
1270         return 0;
1271
1272 clear_hash:
1273         tcp_put_md5sig_pool();
1274 clear_hash_noput:
1275         memset(md5_hash, 0, 16);
1276         return 1;
1277 }
1278 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1279
1280 #endif
1281
1282 /* Called with rcu_read_lock() */
1283 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1284                                     const struct sk_buff *skb)
1285 {
1286 #ifdef CONFIG_TCP_MD5SIG
1287         /*
1288          * This gets called for each TCP segment that arrives
1289          * so we want to be efficient.
1290          * We have 3 drop cases:
1291          * o No MD5 hash and one expected.
1292          * o MD5 hash and we're not expecting one.
1293          * o MD5 hash and its wrong.
1294          */
1295         const __u8 *hash_location = NULL;
1296         struct tcp_md5sig_key *hash_expected;
1297         const struct iphdr *iph = ip_hdr(skb);
1298         const struct tcphdr *th = tcp_hdr(skb);
1299         int genhash;
1300         unsigned char newhash[16];
1301
1302         hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1303                                           AF_INET);
1304         hash_location = tcp_parse_md5sig_option(th);
1305
1306         /* We've parsed the options - do we have a hash? */
1307         if (!hash_expected && !hash_location)
1308                 return false;
1309
1310         if (hash_expected && !hash_location) {
1311                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1312                 return true;
1313         }
1314
1315         if (!hash_expected && hash_location) {
1316                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1317                 return true;
1318         }
1319
1320         /* Okay, so this is hash_expected and hash_location -
1321          * so we need to calculate the checksum.
1322          */
1323         genhash = tcp_v4_md5_hash_skb(newhash,
1324                                       hash_expected,
1325                                       NULL, skb);
1326
1327         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1328                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1329                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1330                                      &iph->saddr, ntohs(th->source),
1331                                      &iph->daddr, ntohs(th->dest),
1332                                      genhash ? " tcp_v4_calc_md5_hash failed"
1333                                      : "");
1334                 return true;
1335         }
1336         return false;
1337 #endif
1338         return false;
1339 }
1340
1341 static void tcp_v4_init_req(struct request_sock *req,
1342                             const struct sock *sk_listener,
1343                             struct sk_buff *skb)
1344 {
1345         struct inet_request_sock *ireq = inet_rsk(req);
1346         struct net *net = sock_net(sk_listener);
1347
1348         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1349         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1350         RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1351 }
1352
1353 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1354                                           struct flowi *fl,
1355                                           const struct request_sock *req)
1356 {
1357         return inet_csk_route_req(sk, &fl->u.ip4, req);
1358 }
1359
1360 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1361         .family         =       PF_INET,
1362         .obj_size       =       sizeof(struct tcp_request_sock),
1363         .rtx_syn_ack    =       tcp_rtx_synack,
1364         .send_ack       =       tcp_v4_reqsk_send_ack,
1365         .destructor     =       tcp_v4_reqsk_destructor,
1366         .send_reset     =       tcp_v4_send_reset,
1367         .syn_ack_timeout =      tcp_syn_ack_timeout,
1368 };
1369
1370 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1371         .mss_clamp      =       TCP_MSS_DEFAULT,
1372 #ifdef CONFIG_TCP_MD5SIG
1373         .req_md5_lookup =       tcp_v4_md5_lookup,
1374         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1375 #endif
1376         .init_req       =       tcp_v4_init_req,
1377 #ifdef CONFIG_SYN_COOKIES
1378         .cookie_init_seq =      cookie_v4_init_sequence,
1379 #endif
1380         .route_req      =       tcp_v4_route_req,
1381         .init_seq       =       tcp_v4_init_seq,
1382         .init_ts_off    =       tcp_v4_init_ts_off,
1383         .send_synack    =       tcp_v4_send_synack,
1384 };
1385
1386 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1387 {
1388         /* Never answer to SYNs send to broadcast or multicast */
1389         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1390                 goto drop;
1391
1392         return tcp_conn_request(&tcp_request_sock_ops,
1393                                 &tcp_request_sock_ipv4_ops, sk, skb);
1394
1395 drop:
1396         tcp_listendrop(sk);
1397         return 0;
1398 }
1399 EXPORT_SYMBOL(tcp_v4_conn_request);
1400
1401
1402 /*
1403  * The three way handshake has completed - we got a valid synack -
1404  * now create the new socket.
1405  */
1406 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1407                                   struct request_sock *req,
1408                                   struct dst_entry *dst,
1409                                   struct request_sock *req_unhash,
1410                                   bool *own_req)
1411 {
1412         struct inet_request_sock *ireq;
1413         struct inet_sock *newinet;
1414         struct tcp_sock *newtp;
1415         struct sock *newsk;
1416 #ifdef CONFIG_TCP_MD5SIG
1417         struct tcp_md5sig_key *key;
1418 #endif
1419         struct ip_options_rcu *inet_opt;
1420
1421         if (sk_acceptq_is_full(sk))
1422                 goto exit_overflow;
1423
1424         newsk = tcp_create_openreq_child(sk, req, skb);
1425         if (!newsk)
1426                 goto exit_nonewsk;
1427
1428         newsk->sk_gso_type = SKB_GSO_TCPV4;
1429         inet_sk_rx_dst_set(newsk, skb);
1430
1431         newtp                 = tcp_sk(newsk);
1432         newinet               = inet_sk(newsk);
1433         ireq                  = inet_rsk(req);
1434         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1435         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1436         newsk->sk_bound_dev_if = ireq->ir_iif;
1437         newinet->inet_saddr   = ireq->ir_loc_addr;
1438         inet_opt              = rcu_dereference(ireq->ireq_opt);
1439         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1440         newinet->mc_index     = inet_iif(skb);
1441         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1442         newinet->rcv_tos      = ip_hdr(skb)->tos;
1443         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1444         if (inet_opt)
1445                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1446         newinet->inet_id = newtp->write_seq ^ jiffies;
1447
1448         if (!dst) {
1449                 dst = inet_csk_route_child_sock(sk, newsk, req);
1450                 if (!dst)
1451                         goto put_and_exit;
1452         } else {
1453                 /* syncookie case : see end of cookie_v4_check() */
1454         }
1455         sk_setup_caps(newsk, dst);
1456
1457         tcp_ca_openreq_child(newsk, dst);
1458
1459         tcp_sync_mss(newsk, dst_mtu(dst));
1460         newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1461
1462         tcp_initialize_rcv_mss(newsk);
1463
1464 #ifdef CONFIG_TCP_MD5SIG
1465         /* Copy over the MD5 key from the original socket */
1466         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1467                                 AF_INET);
1468         if (key) {
1469                 /*
1470                  * We're using one, so create a matching key
1471                  * on the newsk structure. If we fail to get
1472                  * memory, then we end up not copying the key
1473                  * across. Shucks.
1474                  */
1475                 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1476                                AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1477                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1478         }
1479 #endif
1480
1481         if (__inet_inherit_port(sk, newsk) < 0)
1482                 goto put_and_exit;
1483         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1484         if (likely(*own_req)) {
1485                 tcp_move_syn(newtp, req);
1486                 ireq->ireq_opt = NULL;
1487         } else {
1488                 newinet->inet_opt = NULL;
1489         }
1490         return newsk;
1491
1492 exit_overflow:
1493         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1494 exit_nonewsk:
1495         dst_release(dst);
1496 exit:
1497         tcp_listendrop(sk);
1498         return NULL;
1499 put_and_exit:
1500         newinet->inet_opt = NULL;
1501         inet_csk_prepare_forced_close(newsk);
1502         tcp_done(newsk);
1503         goto exit;
1504 }
1505 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1506
1507 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1508 {
1509 #ifdef CONFIG_SYN_COOKIES
1510         const struct tcphdr *th = tcp_hdr(skb);
1511
1512         if (!th->syn)
1513                 sk = cookie_v4_check(sk, skb);
1514 #endif
1515         return sk;
1516 }
1517
1518 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1519                          struct tcphdr *th, u32 *cookie)
1520 {
1521         u16 mss = 0;
1522 #ifdef CONFIG_SYN_COOKIES
1523         mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1524                                     &tcp_request_sock_ipv4_ops, sk, th);
1525         if (mss) {
1526                 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1527                 tcp_synq_overflow(sk);
1528         }
1529 #endif
1530         return mss;
1531 }
1532
1533 /* The socket must have it's spinlock held when we get
1534  * here, unless it is a TCP_LISTEN socket.
1535  *
1536  * We have a potential double-lock case here, so even when
1537  * doing backlog processing we use the BH locking scheme.
1538  * This is because we cannot sleep with the original spinlock
1539  * held.
1540  */
1541 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1542 {
1543         struct sock *rsk;
1544
1545         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1546                 struct dst_entry *dst = sk->sk_rx_dst;
1547
1548                 sock_rps_save_rxhash(sk, skb);
1549                 sk_mark_napi_id(sk, skb);
1550                 if (dst) {
1551                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1552                             !dst->ops->check(dst, 0)) {
1553                                 dst_release(dst);
1554                                 sk->sk_rx_dst = NULL;
1555                         }
1556                 }
1557                 tcp_rcv_established(sk, skb);
1558                 return 0;
1559         }
1560
1561         if (tcp_checksum_complete(skb))
1562                 goto csum_err;
1563
1564         if (sk->sk_state == TCP_LISTEN) {
1565                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1566
1567                 if (!nsk)
1568                         goto discard;
1569                 if (nsk != sk) {
1570                         if (tcp_child_process(sk, nsk, skb)) {
1571                                 rsk = nsk;
1572                                 goto reset;
1573                         }
1574                         return 0;
1575                 }
1576         } else
1577                 sock_rps_save_rxhash(sk, skb);
1578
1579         if (tcp_rcv_state_process(sk, skb)) {
1580                 rsk = sk;
1581                 goto reset;
1582         }
1583         return 0;
1584
1585 reset:
1586         tcp_v4_send_reset(rsk, skb);
1587 discard:
1588         kfree_skb(skb);
1589         /* Be careful here. If this function gets more complicated and
1590          * gcc suffers from register pressure on the x86, sk (in %ebx)
1591          * might be destroyed here. This current version compiles correctly,
1592          * but you have been warned.
1593          */
1594         return 0;
1595
1596 csum_err:
1597         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1598         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1599         goto discard;
1600 }
1601 EXPORT_SYMBOL(tcp_v4_do_rcv);
1602
1603 int tcp_v4_early_demux(struct sk_buff *skb)
1604 {
1605         const struct iphdr *iph;
1606         const struct tcphdr *th;
1607         struct sock *sk;
1608
1609         if (skb->pkt_type != PACKET_HOST)
1610                 return 0;
1611
1612         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1613                 return 0;
1614
1615         iph = ip_hdr(skb);
1616         th = tcp_hdr(skb);
1617
1618         if (th->doff < sizeof(struct tcphdr) / 4)
1619                 return 0;
1620
1621         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1622                                        iph->saddr, th->source,
1623                                        iph->daddr, ntohs(th->dest),
1624                                        skb->skb_iif, inet_sdif(skb));
1625         if (sk) {
1626                 skb->sk = sk;
1627                 skb->destructor = sock_edemux;
1628                 if (sk_fullsock(sk)) {
1629                         struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1630
1631                         if (dst)
1632                                 dst = dst_check(dst, 0);
1633                         if (dst &&
1634                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1635                                 skb_dst_set_noref(skb, dst);
1636                 }
1637         }
1638         return 0;
1639 }
1640
1641 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1642 {
1643         u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1644         struct skb_shared_info *shinfo;
1645         const struct tcphdr *th;
1646         struct tcphdr *thtail;
1647         struct sk_buff *tail;
1648         unsigned int hdrlen;
1649         bool fragstolen;
1650         u32 gso_segs;
1651         int delta;
1652
1653         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1654          * we can fix skb->truesize to its real value to avoid future drops.
1655          * This is valid because skb is not yet charged to the socket.
1656          * It has been noticed pure SACK packets were sometimes dropped
1657          * (if cooked by drivers without copybreak feature).
1658          */
1659         skb_condense(skb);
1660
1661         skb_dst_drop(skb);
1662
1663         if (unlikely(tcp_checksum_complete(skb))) {
1664                 bh_unlock_sock(sk);
1665                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1666                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1667                 return true;
1668         }
1669
1670         /* Attempt coalescing to last skb in backlog, even if we are
1671          * above the limits.
1672          * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1673          */
1674         th = (const struct tcphdr *)skb->data;
1675         hdrlen = th->doff * 4;
1676         shinfo = skb_shinfo(skb);
1677
1678         if (!shinfo->gso_size)
1679                 shinfo->gso_size = skb->len - hdrlen;
1680
1681         if (!shinfo->gso_segs)
1682                 shinfo->gso_segs = 1;
1683
1684         tail = sk->sk_backlog.tail;
1685         if (!tail)
1686                 goto no_coalesce;
1687         thtail = (struct tcphdr *)tail->data;
1688
1689         if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1690             TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1691             ((TCP_SKB_CB(tail)->tcp_flags |
1692               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1693             !((TCP_SKB_CB(tail)->tcp_flags &
1694               TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1695             ((TCP_SKB_CB(tail)->tcp_flags ^
1696               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1697 #ifdef CONFIG_TLS_DEVICE
1698             tail->decrypted != skb->decrypted ||
1699 #endif
1700             thtail->doff != th->doff ||
1701             memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1702                 goto no_coalesce;
1703
1704         __skb_pull(skb, hdrlen);
1705         if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1706                 thtail->window = th->window;
1707
1708                 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1709
1710                 if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))
1711                         TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1712
1713                 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1714                  * thtail->fin, so that the fast path in tcp_rcv_established()
1715                  * is not entered if we append a packet with a FIN.
1716                  * SYN, RST, URG are not present.
1717                  * ACK is set on both packets.
1718                  * PSH : we do not really care in TCP stack,
1719                  *       at least for 'GRO' packets.
1720                  */
1721                 thtail->fin |= th->fin;
1722                 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1723
1724                 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1725                         TCP_SKB_CB(tail)->has_rxtstamp = true;
1726                         tail->tstamp = skb->tstamp;
1727                         skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1728                 }
1729
1730                 /* Not as strict as GRO. We only need to carry mss max value */
1731                 skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
1732                                                  skb_shinfo(tail)->gso_size);
1733
1734                 gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
1735                 skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
1736
1737                 sk->sk_backlog.len += delta;
1738                 __NET_INC_STATS(sock_net(sk),
1739                                 LINUX_MIB_TCPBACKLOGCOALESCE);
1740                 kfree_skb_partial(skb, fragstolen);
1741                 return false;
1742         }
1743         __skb_push(skb, hdrlen);
1744
1745 no_coalesce:
1746         /* Only socket owner can try to collapse/prune rx queues
1747          * to reduce memory overhead, so add a little headroom here.
1748          * Few sockets backlog are possibly concurrently non empty.
1749          */
1750         limit += 64*1024;
1751
1752         if (unlikely(sk_add_backlog(sk, skb, limit))) {
1753                 bh_unlock_sock(sk);
1754                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1755                 return true;
1756         }
1757         return false;
1758 }
1759 EXPORT_SYMBOL(tcp_add_backlog);
1760
1761 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1762 {
1763         struct tcphdr *th = (struct tcphdr *)skb->data;
1764
1765         return sk_filter_trim_cap(sk, skb, th->doff * 4);
1766 }
1767 EXPORT_SYMBOL(tcp_filter);
1768
1769 static void tcp_v4_restore_cb(struct sk_buff *skb)
1770 {
1771         memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1772                 sizeof(struct inet_skb_parm));
1773 }
1774
1775 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1776                            const struct tcphdr *th)
1777 {
1778         /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1779          * barrier() makes sure compiler wont play fool^Waliasing games.
1780          */
1781         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1782                 sizeof(struct inet_skb_parm));
1783         barrier();
1784
1785         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1786         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1787                                     skb->len - th->doff * 4);
1788         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1789         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1790         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1791         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1792         TCP_SKB_CB(skb)->sacked  = 0;
1793         TCP_SKB_CB(skb)->has_rxtstamp =
1794                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1795 }
1796
1797 /*
1798  *      From tcp_input.c
1799  */
1800
1801 int tcp_v4_rcv(struct sk_buff *skb)
1802 {
1803         struct net *net = dev_net(skb->dev);
1804         struct sk_buff *skb_to_free;
1805         int sdif = inet_sdif(skb);
1806         const struct iphdr *iph;
1807         const struct tcphdr *th;
1808         bool refcounted;
1809         struct sock *sk;
1810         int ret;
1811
1812         if (skb->pkt_type != PACKET_HOST)
1813                 goto discard_it;
1814
1815         /* Count it even if it's bad */
1816         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1817
1818         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1819                 goto discard_it;
1820
1821         th = (const struct tcphdr *)skb->data;
1822
1823         if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1824                 goto bad_packet;
1825         if (!pskb_may_pull(skb, th->doff * 4))
1826                 goto discard_it;
1827
1828         /* An explanation is required here, I think.
1829          * Packet length and doff are validated by header prediction,
1830          * provided case of th->doff==0 is eliminated.
1831          * So, we defer the checks. */
1832
1833         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1834                 goto csum_error;
1835
1836         th = (const struct tcphdr *)skb->data;
1837         iph = ip_hdr(skb);
1838 lookup:
1839         sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1840                                th->dest, sdif, &refcounted);
1841         if (!sk)
1842                 goto no_tcp_socket;
1843
1844 process:
1845         if (sk->sk_state == TCP_TIME_WAIT)
1846                 goto do_time_wait;
1847
1848         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1849                 struct request_sock *req = inet_reqsk(sk);
1850                 bool req_stolen = false;
1851                 struct sock *nsk;
1852
1853                 sk = req->rsk_listener;
1854                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1855                         sk_drops_add(sk, skb);
1856                         reqsk_put(req);
1857                         goto discard_it;
1858                 }
1859                 if (tcp_checksum_complete(skb)) {
1860                         reqsk_put(req);
1861                         goto csum_error;
1862                 }
1863                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1864                         inet_csk_reqsk_queue_drop_and_put(sk, req);
1865                         goto lookup;
1866                 }
1867                 /* We own a reference on the listener, increase it again
1868                  * as we might lose it too soon.
1869                  */
1870                 sock_hold(sk);
1871                 refcounted = true;
1872                 nsk = NULL;
1873                 if (!tcp_filter(sk, skb)) {
1874                         th = (const struct tcphdr *)skb->data;
1875                         iph = ip_hdr(skb);
1876                         tcp_v4_fill_cb(skb, iph, th);
1877                         nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1878                 }
1879                 if (!nsk) {
1880                         reqsk_put(req);
1881                         if (req_stolen) {
1882                                 /* Another cpu got exclusive access to req
1883                                  * and created a full blown socket.
1884                                  * Try to feed this packet to this socket
1885                                  * instead of discarding it.
1886                                  */
1887                                 tcp_v4_restore_cb(skb);
1888                                 sock_put(sk);
1889                                 goto lookup;
1890                         }
1891                         goto discard_and_relse;
1892                 }
1893                 if (nsk == sk) {
1894                         reqsk_put(req);
1895                         tcp_v4_restore_cb(skb);
1896                 } else if (tcp_child_process(sk, nsk, skb)) {
1897                         tcp_v4_send_reset(nsk, skb);
1898                         goto discard_and_relse;
1899                 } else {
1900                         sock_put(sk);
1901                         return 0;
1902                 }
1903         }
1904         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1905                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1906                 goto discard_and_relse;
1907         }
1908
1909         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1910                 goto discard_and_relse;
1911
1912         if (tcp_v4_inbound_md5_hash(sk, skb))
1913                 goto discard_and_relse;
1914
1915         nf_reset(skb);
1916
1917         if (tcp_filter(sk, skb))
1918                 goto discard_and_relse;
1919         th = (const struct tcphdr *)skb->data;
1920         iph = ip_hdr(skb);
1921         tcp_v4_fill_cb(skb, iph, th);
1922
1923         skb->dev = NULL;
1924
1925         if (sk->sk_state == TCP_LISTEN) {
1926                 ret = tcp_v4_do_rcv(sk, skb);
1927                 goto put_and_return;
1928         }
1929
1930         sk_incoming_cpu_update(sk);
1931
1932         bh_lock_sock_nested(sk);
1933         tcp_segs_in(tcp_sk(sk), skb);
1934         ret = 0;
1935         if (!sock_owned_by_user(sk)) {
1936                 skb_to_free = sk->sk_rx_skb_cache;
1937                 sk->sk_rx_skb_cache = NULL;
1938                 ret = tcp_v4_do_rcv(sk, skb);
1939         } else {
1940                 if (tcp_add_backlog(sk, skb))
1941                         goto discard_and_relse;
1942                 skb_to_free = NULL;
1943         }
1944         bh_unlock_sock(sk);
1945         if (skb_to_free)
1946                 __kfree_skb(skb_to_free);
1947
1948 put_and_return:
1949         if (refcounted)
1950                 sock_put(sk);
1951
1952         return ret;
1953
1954 no_tcp_socket:
1955         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1956                 goto discard_it;
1957
1958         tcp_v4_fill_cb(skb, iph, th);
1959
1960         if (tcp_checksum_complete(skb)) {
1961 csum_error:
1962                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1963 bad_packet:
1964                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
1965         } else {
1966                 tcp_v4_send_reset(NULL, skb);
1967         }
1968
1969 discard_it:
1970         /* Discard frame. */
1971         kfree_skb(skb);
1972         return 0;
1973
1974 discard_and_relse:
1975         sk_drops_add(sk, skb);
1976         if (refcounted)
1977                 sock_put(sk);
1978         goto discard_it;
1979
1980 do_time_wait:
1981         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1982                 inet_twsk_put(inet_twsk(sk));
1983                 goto discard_it;
1984         }
1985
1986         tcp_v4_fill_cb(skb, iph, th);
1987
1988         if (tcp_checksum_complete(skb)) {
1989                 inet_twsk_put(inet_twsk(sk));
1990                 goto csum_error;
1991         }
1992         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1993         case TCP_TW_SYN: {
1994                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1995                                                         &tcp_hashinfo, skb,
1996                                                         __tcp_hdrlen(th),
1997                                                         iph->saddr, th->source,
1998                                                         iph->daddr, th->dest,
1999                                                         inet_iif(skb),
2000                                                         sdif);
2001                 if (sk2) {
2002                         inet_twsk_deschedule_put(inet_twsk(sk));
2003                         sk = sk2;
2004                         tcp_v4_restore_cb(skb);
2005                         refcounted = false;
2006                         goto process;
2007                 }
2008         }
2009                 /* to ACK */
2010                 /* fall through */
2011         case TCP_TW_ACK:
2012                 tcp_v4_timewait_ack(sk, skb);
2013                 break;
2014         case TCP_TW_RST:
2015                 tcp_v4_send_reset(sk, skb);
2016                 inet_twsk_deschedule_put(inet_twsk(sk));
2017                 goto discard_it;
2018         case TCP_TW_SUCCESS:;
2019         }
2020         goto discard_it;
2021 }
2022
2023 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2024         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2025         .twsk_unique    = tcp_twsk_unique,
2026         .twsk_destructor= tcp_twsk_destructor,
2027 };
2028
2029 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2030 {
2031         struct dst_entry *dst = skb_dst(skb);
2032
2033         if (dst && dst_hold_safe(dst)) {
2034                 sk->sk_rx_dst = dst;
2035                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2036         }
2037 }
2038 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2039
2040 const struct inet_connection_sock_af_ops ipv4_specific = {
2041         .queue_xmit        = ip_queue_xmit,
2042         .send_check        = tcp_v4_send_check,
2043         .rebuild_header    = inet_sk_rebuild_header,
2044         .sk_rx_dst_set     = inet_sk_rx_dst_set,
2045         .conn_request      = tcp_v4_conn_request,
2046         .syn_recv_sock     = tcp_v4_syn_recv_sock,
2047         .net_header_len    = sizeof(struct iphdr),
2048         .setsockopt        = ip_setsockopt,
2049         .getsockopt        = ip_getsockopt,
2050         .addr2sockaddr     = inet_csk_addr2sockaddr,
2051         .sockaddr_len      = sizeof(struct sockaddr_in),
2052 #ifdef CONFIG_COMPAT
2053         .compat_setsockopt = compat_ip_setsockopt,
2054         .compat_getsockopt = compat_ip_getsockopt,
2055 #endif
2056         .mtu_reduced       = tcp_v4_mtu_reduced,
2057 };
2058 EXPORT_SYMBOL(ipv4_specific);
2059
2060 #ifdef CONFIG_TCP_MD5SIG
2061 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2062         .md5_lookup             = tcp_v4_md5_lookup,
2063         .calc_md5_hash          = tcp_v4_md5_hash_skb,
2064         .md5_parse              = tcp_v4_parse_md5_keys,
2065 };
2066 #endif
2067
2068 /* NOTE: A lot of things set to zero explicitly by call to
2069  *       sk_alloc() so need not be done here.
2070  */
2071 static int tcp_v4_init_sock(struct sock *sk)
2072 {
2073         struct inet_connection_sock *icsk = inet_csk(sk);
2074
2075         tcp_init_sock(sk);
2076
2077         icsk->icsk_af_ops = &ipv4_specific;
2078
2079 #ifdef CONFIG_TCP_MD5SIG
2080         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2081 #endif
2082
2083         return 0;
2084 }
2085
2086 void tcp_v4_destroy_sock(struct sock *sk)
2087 {
2088         struct tcp_sock *tp = tcp_sk(sk);
2089
2090         trace_tcp_destroy_sock(sk);
2091
2092         tcp_clear_xmit_timers(sk);
2093
2094         tcp_cleanup_congestion_control(sk);
2095
2096         tcp_cleanup_ulp(sk);
2097
2098         /* Cleanup up the write buffer. */
2099         tcp_write_queue_purge(sk);
2100
2101         /* Check if we want to disable active TFO */
2102         tcp_fastopen_active_disable_ofo_check(sk);
2103
2104         /* Cleans up our, hopefully empty, out_of_order_queue. */
2105         skb_rbtree_purge(&tp->out_of_order_queue);
2106
2107 #ifdef CONFIG_TCP_MD5SIG
2108         /* Clean up the MD5 key list, if any */
2109         if (tp->md5sig_info) {
2110                 tcp_clear_md5_list(sk);
2111                 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2112                 tp->md5sig_info = NULL;
2113         }
2114 #endif
2115
2116         /* Clean up a referenced TCP bind bucket. */
2117         if (inet_csk(sk)->icsk_bind_hash)
2118                 inet_put_port(sk);
2119
2120         BUG_ON(tp->fastopen_rsk);
2121
2122         /* If socket is aborted during connect operation */
2123         tcp_free_fastopen_req(tp);
2124         tcp_fastopen_destroy_cipher(sk);
2125         tcp_saved_syn_free(tp);
2126
2127         sk_sockets_allocated_dec(sk);
2128 }
2129 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2130
2131 #ifdef CONFIG_PROC_FS
2132 /* Proc filesystem TCP sock list dumping. */
2133
2134 /*
2135  * Get next listener socket follow cur.  If cur is NULL, get first socket
2136  * starting from bucket given in st->bucket; when st->bucket is zero the
2137  * very first socket in the hash table is returned.
2138  */
2139 static void *listening_get_next(struct seq_file *seq, void *cur)
2140 {
2141         struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2142         struct tcp_iter_state *st = seq->private;
2143         struct net *net = seq_file_net(seq);
2144         struct inet_listen_hashbucket *ilb;
2145         struct sock *sk = cur;
2146
2147         if (!sk) {
2148 get_head:
2149                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2150                 spin_lock(&ilb->lock);
2151                 sk = sk_head(&ilb->head);
2152                 st->offset = 0;
2153                 goto get_sk;
2154         }
2155         ilb = &tcp_hashinfo.listening_hash[st->bucket];
2156         ++st->num;
2157         ++st->offset;
2158
2159         sk = sk_next(sk);
2160 get_sk:
2161         sk_for_each_from(sk) {
2162                 if (!net_eq(sock_net(sk), net))
2163                         continue;
2164                 if (sk->sk_family == afinfo->family)
2165                         return sk;
2166         }
2167         spin_unlock(&ilb->lock);
2168         st->offset = 0;
2169         if (++st->bucket < INET_LHTABLE_SIZE)
2170                 goto get_head;
2171         return NULL;
2172 }
2173
2174 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2175 {
2176         struct tcp_iter_state *st = seq->private;
2177         void *rc;
2178
2179         st->bucket = 0;
2180         st->offset = 0;
2181         rc = listening_get_next(seq, NULL);
2182
2183         while (rc && *pos) {
2184                 rc = listening_get_next(seq, rc);
2185                 --*pos;
2186         }
2187         return rc;
2188 }
2189
2190 static inline bool empty_bucket(const struct tcp_iter_state *st)
2191 {
2192         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2193 }
2194
2195 /*
2196  * Get first established socket starting from bucket given in st->bucket.
2197  * If st->bucket is zero, the very first socket in the hash is returned.
2198  */
2199 static void *established_get_first(struct seq_file *seq)
2200 {
2201         struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2202         struct tcp_iter_state *st = seq->private;
2203         struct net *net = seq_file_net(seq);
2204         void *rc = NULL;
2205
2206         st->offset = 0;
2207         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2208                 struct sock *sk;
2209                 struct hlist_nulls_node *node;
2210                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2211
2212                 /* Lockless fast path for the common case of empty buckets */
2213                 if (empty_bucket(st))
2214                         continue;
2215
2216                 spin_lock_bh(lock);
2217                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2218                         if (sk->sk_family != afinfo->family ||
2219                             !net_eq(sock_net(sk), net)) {
2220                                 continue;
2221                         }
2222                         rc = sk;
2223                         goto out;
2224                 }
2225                 spin_unlock_bh(lock);
2226         }
2227 out:
2228         return rc;
2229 }
2230
2231 static void *established_get_next(struct seq_file *seq, void *cur)
2232 {
2233         struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2234         struct sock *sk = cur;
2235         struct hlist_nulls_node *node;
2236         struct tcp_iter_state *st = seq->private;
2237         struct net *net = seq_file_net(seq);
2238
2239         ++st->num;
2240         ++st->offset;
2241
2242         sk = sk_nulls_next(sk);
2243
2244         sk_nulls_for_each_from(sk, node) {
2245                 if (sk->sk_family == afinfo->family &&
2246                     net_eq(sock_net(sk), net))
2247                         return sk;
2248         }
2249
2250         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2251         ++st->bucket;
2252         return established_get_first(seq);
2253 }
2254
2255 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2256 {
2257         struct tcp_iter_state *st = seq->private;
2258         void *rc;
2259
2260         st->bucket = 0;
2261         rc = established_get_first(seq);
2262
2263         while (rc && pos) {
2264                 rc = established_get_next(seq, rc);
2265                 --pos;
2266         }
2267         return rc;
2268 }
2269
2270 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2271 {
2272         void *rc;
2273         struct tcp_iter_state *st = seq->private;
2274
2275         st->state = TCP_SEQ_STATE_LISTENING;
2276         rc        = listening_get_idx(seq, &pos);
2277
2278         if (!rc) {
2279                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2280                 rc        = established_get_idx(seq, pos);
2281         }
2282
2283         return rc;
2284 }
2285
2286 static void *tcp_seek_last_pos(struct seq_file *seq)
2287 {
2288         struct tcp_iter_state *st = seq->private;
2289         int offset = st->offset;
2290         int orig_num = st->num;
2291         void *rc = NULL;
2292
2293         switch (st->state) {
2294         case TCP_SEQ_STATE_LISTENING:
2295                 if (st->bucket >= INET_LHTABLE_SIZE)
2296                         break;
2297                 st->state = TCP_SEQ_STATE_LISTENING;
2298                 rc = listening_get_next(seq, NULL);
2299                 while (offset-- && rc)
2300                         rc = listening_get_next(seq, rc);
2301                 if (rc)
2302                         break;
2303                 st->bucket = 0;
2304                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2305                 /* Fallthrough */
2306         case TCP_SEQ_STATE_ESTABLISHED:
2307                 if (st->bucket > tcp_hashinfo.ehash_mask)
2308                         break;
2309                 rc = established_get_first(seq);
2310                 while (offset-- && rc)
2311                         rc = established_get_next(seq, rc);
2312         }
2313
2314         st->num = orig_num;
2315
2316         return rc;
2317 }
2318
2319 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2320 {
2321         struct tcp_iter_state *st = seq->private;
2322         void *rc;
2323
2324         if (*pos && *pos == st->last_pos) {
2325                 rc = tcp_seek_last_pos(seq);
2326                 if (rc)
2327                         goto out;
2328         }
2329
2330         st->state = TCP_SEQ_STATE_LISTENING;
2331         st->num = 0;
2332         st->bucket = 0;
2333         st->offset = 0;
2334         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2335
2336 out:
2337         st->last_pos = *pos;
2338         return rc;
2339 }
2340 EXPORT_SYMBOL(tcp_seq_start);
2341
2342 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2343 {
2344         struct tcp_iter_state *st = seq->private;
2345         void *rc = NULL;
2346
2347         if (v == SEQ_START_TOKEN) {
2348                 rc = tcp_get_idx(seq, 0);
2349                 goto out;
2350         }
2351
2352         switch (st->state) {
2353         case TCP_SEQ_STATE_LISTENING:
2354                 rc = listening_get_next(seq, v);
2355                 if (!rc) {
2356                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2357                         st->bucket = 0;
2358                         st->offset = 0;
2359                         rc        = established_get_first(seq);
2360                 }
2361                 break;
2362         case TCP_SEQ_STATE_ESTABLISHED:
2363                 rc = established_get_next(seq, v);
2364                 break;
2365         }
2366 out:
2367         ++*pos;
2368         st->last_pos = *pos;
2369         return rc;
2370 }
2371 EXPORT_SYMBOL(tcp_seq_next);
2372
2373 void tcp_seq_stop(struct seq_file *seq, void *v)
2374 {
2375         struct tcp_iter_state *st = seq->private;
2376
2377         switch (st->state) {
2378         case TCP_SEQ_STATE_LISTENING:
2379                 if (v != SEQ_START_TOKEN)
2380                         spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2381                 break;
2382         case TCP_SEQ_STATE_ESTABLISHED:
2383                 if (v)
2384                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2385                 break;
2386         }
2387 }
2388 EXPORT_SYMBOL(tcp_seq_stop);
2389
2390 static void get_openreq4(const struct request_sock *req,
2391                          struct seq_file *f, int i)
2392 {
2393         const struct inet_request_sock *ireq = inet_rsk(req);
2394         long delta = req->rsk_timer.expires - jiffies;
2395
2396         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2397                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2398                 i,
2399                 ireq->ir_loc_addr,
2400                 ireq->ir_num,
2401                 ireq->ir_rmt_addr,
2402                 ntohs(ireq->ir_rmt_port),
2403                 TCP_SYN_RECV,
2404                 0, 0, /* could print option size, but that is af dependent. */
2405                 1,    /* timers active (only the expire timer) */
2406                 jiffies_delta_to_clock_t(delta),
2407                 req->num_timeout,
2408                 from_kuid_munged(seq_user_ns(f),
2409                                  sock_i_uid(req->rsk_listener)),
2410                 0,  /* non standard timer */
2411                 0, /* open_requests have no inode */
2412                 0,
2413                 req);
2414 }
2415
2416 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2417 {
2418         int timer_active;
2419         unsigned long timer_expires;
2420         const struct tcp_sock *tp = tcp_sk(sk);
2421         const struct inet_connection_sock *icsk = inet_csk(sk);
2422         const struct inet_sock *inet = inet_sk(sk);
2423         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2424         __be32 dest = inet->inet_daddr;
2425         __be32 src = inet->inet_rcv_saddr;
2426         __u16 destp = ntohs(inet->inet_dport);
2427         __u16 srcp = ntohs(inet->inet_sport);
2428         int rx_queue;
2429         int state;
2430
2431         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2432             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2433             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2434                 timer_active    = 1;
2435                 timer_expires   = icsk->icsk_timeout;
2436         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2437                 timer_active    = 4;
2438                 timer_expires   = icsk->icsk_timeout;
2439         } else if (timer_pending(&sk->sk_timer)) {
2440                 timer_active    = 2;
2441                 timer_expires   = sk->sk_timer.expires;
2442         } else {
2443                 timer_active    = 0;
2444                 timer_expires = jiffies;
2445         }
2446
2447         state = inet_sk_state_load(sk);
2448         if (state == TCP_LISTEN)
2449                 rx_queue = sk->sk_ack_backlog;
2450         else
2451                 /* Because we don't lock the socket,
2452                  * we might find a transient negative value.
2453                  */
2454                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2455
2456         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2457                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2458                 i, src, srcp, dest, destp, state,
2459                 tp->write_seq - tp->snd_una,
2460                 rx_queue,
2461                 timer_active,
2462                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2463                 icsk->icsk_retransmits,
2464                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2465                 icsk->icsk_probes_out,
2466                 sock_i_ino(sk),
2467                 refcount_read(&sk->sk_refcnt), sk,
2468                 jiffies_to_clock_t(icsk->icsk_rto),
2469                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2470                 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2471                 tp->snd_cwnd,
2472                 state == TCP_LISTEN ?
2473                     fastopenq->max_qlen :
2474                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2475 }
2476
2477 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2478                                struct seq_file *f, int i)
2479 {
2480         long delta = tw->tw_timer.expires - jiffies;
2481         __be32 dest, src;
2482         __u16 destp, srcp;
2483
2484         dest  = tw->tw_daddr;
2485         src   = tw->tw_rcv_saddr;
2486         destp = ntohs(tw->tw_dport);
2487         srcp  = ntohs(tw->tw_sport);
2488
2489         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2490                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2491                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2492                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2493                 refcount_read(&tw->tw_refcnt), tw);
2494 }
2495
2496 #define TMPSZ 150
2497
2498 static int tcp4_seq_show(struct seq_file *seq, void *v)
2499 {
2500         struct tcp_iter_state *st;
2501         struct sock *sk = v;
2502
2503         seq_setwidth(seq, TMPSZ - 1);
2504         if (v == SEQ_START_TOKEN) {
2505                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2506                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2507                            "inode");
2508                 goto out;
2509         }
2510         st = seq->private;
2511
2512         if (sk->sk_state == TCP_TIME_WAIT)
2513                 get_timewait4_sock(v, seq, st->num);
2514         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2515                 get_openreq4(v, seq, st->num);
2516         else
2517                 get_tcp4_sock(v, seq, st->num);
2518 out:
2519         seq_pad(seq, '\n');
2520         return 0;
2521 }
2522
2523 static const struct seq_operations tcp4_seq_ops = {
2524         .show           = tcp4_seq_show,
2525         .start          = tcp_seq_start,
2526         .next           = tcp_seq_next,
2527         .stop           = tcp_seq_stop,
2528 };
2529
2530 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2531         .family         = AF_INET,
2532 };
2533
2534 static int __net_init tcp4_proc_init_net(struct net *net)
2535 {
2536         if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2537                         sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2538                 return -ENOMEM;
2539         return 0;
2540 }
2541
2542 static void __net_exit tcp4_proc_exit_net(struct net *net)
2543 {
2544         remove_proc_entry("tcp", net->proc_net);
2545 }
2546
2547 static struct pernet_operations tcp4_net_ops = {
2548         .init = tcp4_proc_init_net,
2549         .exit = tcp4_proc_exit_net,
2550 };
2551
2552 int __init tcp4_proc_init(void)
2553 {
2554         return register_pernet_subsys(&tcp4_net_ops);
2555 }
2556
2557 void tcp4_proc_exit(void)
2558 {
2559         unregister_pernet_subsys(&tcp4_net_ops);
2560 }
2561 #endif /* CONFIG_PROC_FS */
2562
2563 struct proto tcp_prot = {
2564         .name                   = "TCP",
2565         .owner                  = THIS_MODULE,
2566         .close                  = tcp_close,
2567         .pre_connect            = tcp_v4_pre_connect,
2568         .connect                = tcp_v4_connect,
2569         .disconnect             = tcp_disconnect,
2570         .accept                 = inet_csk_accept,
2571         .ioctl                  = tcp_ioctl,
2572         .init                   = tcp_v4_init_sock,
2573         .destroy                = tcp_v4_destroy_sock,
2574         .shutdown               = tcp_shutdown,
2575         .setsockopt             = tcp_setsockopt,
2576         .getsockopt             = tcp_getsockopt,
2577         .keepalive              = tcp_set_keepalive,
2578         .recvmsg                = tcp_recvmsg,
2579         .sendmsg                = tcp_sendmsg,
2580         .sendpage               = tcp_sendpage,
2581         .backlog_rcv            = tcp_v4_do_rcv,
2582         .release_cb             = tcp_release_cb,
2583         .hash                   = inet_hash,
2584         .unhash                 = inet_unhash,
2585         .get_port               = inet_csk_get_port,
2586         .enter_memory_pressure  = tcp_enter_memory_pressure,
2587         .leave_memory_pressure  = tcp_leave_memory_pressure,
2588         .stream_memory_free     = tcp_stream_memory_free,
2589         .sockets_allocated      = &tcp_sockets_allocated,
2590         .orphan_count           = &tcp_orphan_count,
2591         .memory_allocated       = &tcp_memory_allocated,
2592         .memory_pressure        = &tcp_memory_pressure,
2593         .sysctl_mem             = sysctl_tcp_mem,
2594         .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2595         .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2596         .max_header             = MAX_TCP_HEADER,
2597         .obj_size               = sizeof(struct tcp_sock),
2598         .slab_flags             = SLAB_TYPESAFE_BY_RCU,
2599         .twsk_prot              = &tcp_timewait_sock_ops,
2600         .rsk_prot               = &tcp_request_sock_ops,
2601         .h.hashinfo             = &tcp_hashinfo,
2602         .no_autobind            = true,
2603 #ifdef CONFIG_COMPAT
2604         .compat_setsockopt      = compat_tcp_setsockopt,
2605         .compat_getsockopt      = compat_tcp_getsockopt,
2606 #endif
2607         .diag_destroy           = tcp_abort,
2608 };
2609 EXPORT_SYMBOL(tcp_prot);
2610
2611 static void __net_exit tcp_sk_exit(struct net *net)
2612 {
2613         int cpu;
2614
2615         if (net->ipv4.tcp_congestion_control)
2616                 module_put(net->ipv4.tcp_congestion_control->owner);
2617
2618         for_each_possible_cpu(cpu)
2619                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2620         free_percpu(net->ipv4.tcp_sk);
2621 }
2622
2623 static int __net_init tcp_sk_init(struct net *net)
2624 {
2625         int res, cpu, cnt;
2626
2627         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2628         if (!net->ipv4.tcp_sk)
2629                 return -ENOMEM;
2630
2631         for_each_possible_cpu(cpu) {
2632                 struct sock *sk;
2633
2634                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2635                                            IPPROTO_TCP, net);
2636                 if (res)
2637                         goto fail;
2638                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2639
2640                 /* Please enforce IP_DF and IPID==0 for RST and
2641                  * ACK sent in SYN-RECV and TIME-WAIT state.
2642                  */
2643                 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2644
2645                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2646         }
2647
2648         net->ipv4.sysctl_tcp_ecn = 2;
2649         net->ipv4.sysctl_tcp_ecn_fallback = 1;
2650
2651         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2652         net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2653         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2654         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2655         net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
2656
2657         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2658         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2659         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2660
2661         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2662         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2663         net->ipv4.sysctl_tcp_syncookies = 1;
2664         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2665         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2666         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2667         net->ipv4.sysctl_tcp_orphan_retries = 0;
2668         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2669         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2670         net->ipv4.sysctl_tcp_tw_reuse = 2;
2671
2672         cnt = tcp_hashinfo.ehash_mask + 1;
2673         net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2674         net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2675
2676         net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2677         net->ipv4.sysctl_tcp_sack = 1;
2678         net->ipv4.sysctl_tcp_window_scaling = 1;
2679         net->ipv4.sysctl_tcp_timestamps = 1;
2680         net->ipv4.sysctl_tcp_early_retrans = 3;
2681         net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2682         net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2683         net->ipv4.sysctl_tcp_retrans_collapse = 1;
2684         net->ipv4.sysctl_tcp_max_reordering = 300;
2685         net->ipv4.sysctl_tcp_dsack = 1;
2686         net->ipv4.sysctl_tcp_app_win = 31;
2687         net->ipv4.sysctl_tcp_adv_win_scale = 1;
2688         net->ipv4.sysctl_tcp_frto = 2;
2689         net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2690         /* This limits the percentage of the congestion window which we
2691          * will allow a single TSO frame to consume.  Building TSO frames
2692          * which are too large can cause TCP streams to be bursty.
2693          */
2694         net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2695         /* Default TSQ limit of 16 TSO segments */
2696         net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
2697         /* rfc5961 challenge ack rate limiting */
2698         net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2699         net->ipv4.sysctl_tcp_min_tso_segs = 2;
2700         net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2701         net->ipv4.sysctl_tcp_autocorking = 1;
2702         net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2703         net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2704         net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2705         if (net != &init_net) {
2706                 memcpy(net->ipv4.sysctl_tcp_rmem,
2707                        init_net.ipv4.sysctl_tcp_rmem,
2708                        sizeof(init_net.ipv4.sysctl_tcp_rmem));
2709                 memcpy(net->ipv4.sysctl_tcp_wmem,
2710                        init_net.ipv4.sysctl_tcp_wmem,
2711                        sizeof(init_net.ipv4.sysctl_tcp_wmem));
2712         }
2713         net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2714         net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2715         net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2716         spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2717         net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2718         atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2719
2720         /* Reno is always built in */
2721         if (!net_eq(net, &init_net) &&
2722             try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2723                 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2724         else
2725                 net->ipv4.tcp_congestion_control = &tcp_reno;
2726
2727         return 0;
2728 fail:
2729         tcp_sk_exit(net);
2730
2731         return res;
2732 }
2733
2734 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2735 {
2736         struct net *net;
2737
2738         inet_twsk_purge(&tcp_hashinfo, AF_INET);
2739
2740         list_for_each_entry(net, net_exit_list, exit_list)
2741                 tcp_fastopen_ctx_destroy(net);
2742 }
2743
2744 static struct pernet_operations __net_initdata tcp_sk_ops = {
2745        .init       = tcp_sk_init,
2746        .exit       = tcp_sk_exit,
2747        .exit_batch = tcp_sk_exit_batch,
2748 };
2749
2750 void __init tcp_v4_init(void)
2751 {
2752         if (register_pernet_subsys(&tcp_sk_ops))
2753                 panic("Failed to create the TCP control socket.\n");
2754 }