// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt
#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

#include <trace/events/tcp.h>
#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);
static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
	return secure_tcp_seq(ip_hdr(skb)->daddr,
			      ip_hdr(skb)->saddr,
			      tcp_hdr(skb)->dest,
			      tcp_hdr(skb)->source);
}

static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}
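
/* tcp_twsk_unique() below decides whether a new connect() may reuse a
 * four-tuple that is still held by a TIME-WAIT socket.  Roughly, reuse
 * is governed by the net.ipv4.sysctl_tcp_tw_reuse value read there:
 * 0 - never reuse, 1 - reuse when TCP timestamps make it safe,
 * 2 - reuse only for loopback traffic (the check implemented below).
 */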
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct inet_timewait_sock *tw = inet_twsk(sktw);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);
	int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;

	if (reuse == 2) {
		/* Still does not detect *everything* that goes through
		 * lo, since we require a loopback src or dst address
		 * or direct binding to 'lo' interface.
		 */
		bool loopback = false;

		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
			loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == AF_INET6) {
			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
			    (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
			     (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
			    (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
			     (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
				loopback = true;
		} else
#endif
		{
			if (ipv4_is_loopback(tw->tw_daddr) ||
			    ipv4_is_loopback(tw->tw_rcv_saddr))
				loopback = true;
		}
		if (!loopback)
			reuse = 0;
	}

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (!twp || (reuse && time_after32(ktime_get_seconds(),
					    tcptw->tw_ts_recent_stamp)))) {
		/* In case of repair and re-using TIME-WAIT sockets we still
		 * want to be sure that it is safe as above but honor the
		 * sequence numbers and time stamps set as part of the repair
		 * process.
		 *
		 * Without this check re-using a TIME-WAIT socket with TCP
		 * repair would accumulate a -1 on the repair assigned
		 * sequence number. The first time it is reused the sequence
		 * is -1, the second time -2, etc. This fixes that issue
		 * without appearing to create any others.
		 */
		if (likely(!tp->repair)) {
			tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
			if (tp->write_seq == 0)
				tp->write_seq = 1;
			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		}
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);
static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
			      int addr_len)
{
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent BPF program called below from accessing bytes that are out
	 * of the bound specified by user in addr_len.
	 */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	sock_owned_by_me(sk);

	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
}
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;
	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	sk_rcv_saddr_set(sk, inet->inet_saddr);

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			tp->write_seq	   = 0;
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and not releasing socket
	 * lock select source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			tp->write_seq = secure_tcp_seq(inet->inet_saddr,
						       inet->inet_daddr,
						       inet->inet_sport,
						       usin->sin_port);
		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
						 inet->inet_saddr,
						 inet->inet_daddr);
	}

	inet->inet_id = tp->write_seq ^ jiffies;

	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);
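
/* tcp_v4_connect() is reached through the ordinary socket API; an
 * illustrative userspace sketch (not part of this file):
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = { .sin_family = AF_INET,
 *				   .sin_port = htons(80) };
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 */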
/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu;

	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = tcp_sk(sk)->mtu_info;
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);
static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}
/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	struct request_sock *fastopen;
	u32 seq, snd_una;
	s32 remaining;
	u32 delta_us;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
				       th->dest, iph->saddr, ntohs(th->source),
				       inet_iif(icmp_skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return -ENOENT;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return 0;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
				     type == ICMP_TIME_EXCEEDED ||
				     (type == ICMP_DEST_UNREACH &&
				      (code == ICMP_NET_UNREACH ||
				       code == ICMP_HOST_UNREACH)));
		return 0;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = tp->fastopen_rsk;
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(icmp_skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs send out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			tp->mtu_info = info;
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if icmp_skb allows revert of backoff
		 * (see draft-zimmermann-tcp-lcd) */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff || fastopen)
			break;

		if (sock_owned_by_user(sk))
			break;

		skb = tcp_rtx_queue_head(sk);
		if (WARN_ON_ONCE(!skb))
			break;

		icsk->icsk_backoff--;
		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
					       TCP_TIMEOUT_INIT;
		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

		tcp_mstamp_refresh(tp);
		delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
		remaining = icsk->icsk_rto -
			    usecs_to_jiffies(delta_us);

		if (remaining > 0) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * is already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even this two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 *							--ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
	return 0;
}
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);
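
/* Note: __tcp_v4_send_check() stores only the pseudo-header sum plus the
 * csum_start/csum_offset pair expected for CHECKSUM_PARTIAL; the device
 * (or skb_checksum_help() as a fallback) folds in the payload checksum
 * at transmit time.
 */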
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So that we build reply only basing on parameters
 *		arrived with segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	u64 transmit_time = 0;
	struct sock *ctl_sk;
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	hash_location = tcp_parse_md5sig_option(th);
	if (sk && sk_fullsock(sk)) {
		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
	} else if (hash_location) {
		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * we are not loose security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
					     ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), inet_iif(skb),
					     inet_sdif(skb));
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
		if (!key)
			goto out;

		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto out;
	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				     key, ip_hdr(skb)->saddr,
				     ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk) {
		arg.bound_dev_if = sk->sk_bound_dev_if;
		if (sk_fullsock(sk))
			trace_tcp_send_reset(sk, skb);
	}

	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
	if (sk) {
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : sk->sk_mark;
		transmit_time = tcp_transmit_time(sk);
	}
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time);

	ctl_sk->sk_mark = 0;
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}
/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;
	struct sock *ctl_sk;
	u64 transmit_time;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_mark : sk->sk_mark;
	transmit_time = tcp_transmit_time(sk);
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time);

	ctl_sk->sk_mark = 0;
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_bh_enable();
}
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
			);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
					     tcp_sk(sk)->snd_nxt;

	/* RFC 7323 2.3
	 * The window field (SEG.WND) of every outgoing segment, with the
	 * exception of <SYN> segments, MUST be right-shifted by
	 * Rcv.Wind.Shift bits:
	 */
	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
					  AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}
/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, synack_type);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		rcu_read_lock();
		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    rcu_dereference(ireq->ireq_opt));
		rcu_read_unlock();
		err = net_xmit_eval(err);
	}

	return err;
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}
#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
EXPORT_SYMBOL(tcp_md5_needed);

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk,
					   const union tcp_md5_addr *addr,
					   int family)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	const struct tcp_md5sig_info *md5sig;
	__be32 mask;
	struct tcp_md5sig_key *best_match = NULL;
	bool match;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;

	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
		if (key->family != family)
			continue;

		if (family == AF_INET) {
			mask = inet_make_mask(key->prefixlen);
			match = (key->addr.a4.s_addr & mask) ==
				(addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
		} else if (family == AF_INET6) {
			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
						  key->prefixlen);
#endif
		} else {
			match = false;
		}

		if (match && (!best_match ||
			      key->prefixlen > best_match->prefixlen))
			best_match = key;
	}
	return best_match;
}
EXPORT_SYMBOL(__tcp_md5_do_lookup);
static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
						      int family, u8 prefixlen)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
		if (key->family != family)
			continue;
		if (!memcmp(&key->addr, addr, size) &&
		    key->prefixlen == prefixlen)
			return key;
	}
	return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;

	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);
/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
		   gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
	if (key) {
		/* Pre-existing entry - just update that one. */
		memcpy(key->key, newkey, newkeylen);
		key->keylen = newkeylen;
		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	key->prefixlen = prefixlen;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ? sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
		   u8 prefixlen)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);
static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}
static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
				 char __user *optval, int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	u8 prefixlen = 32;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
		prefixlen = cmd.tcpm_prefixlen;
		if (prefixlen > 32)
			return -EINVAL;
	}

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
				      AF_INET, prefixlen);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
			      AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
			      GFP_KERNEL);
}
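
/* The option parsed above is set from userspace roughly as follows
 * (illustrative sketch only):
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *a = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	a->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &a->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */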
static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;
	struct tcphdr *_th;

	bp = hp->scratch;
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	_th = (struct tcphdr *)(bp + 1);
	memcpy(_th, th, sizeof(*th));
	_th->check = 0;

	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
				sizeof(*bp) + sizeof(*th));
	return crypto_ahash_update(hp->md5_req);
}
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;
	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
			const struct sock *sk,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);

		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;

	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

#endif
/* Called with rcu_read_lock() */
static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
				    const struct sk_buff *skb)
{
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and its wrong.
	 */
	const __u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	const struct tcphdr *th = tcp_hdr(skb);
	int genhash;
	unsigned char newhash[16];

	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
					  AF_INET);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return false;

	if (hash_expected && !hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return true;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return true;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
				     &iph->saddr, ntohs(th->source),
				     &iph->daddr, ntohs(th->dest),
				     genhash ? " tcp_v4_calc_md5_hash failed"
					     : "");
		return true;
	}
	return false;
#endif
	return false;
}
static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = sock_net(sk_listener);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
}

static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct flowi *fl,
					  const struct request_sock *req)
{
	return inet_csk_route_req(sk, &fl->u.ip4, req);
}
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
	.init_req	=	tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_seq,
	.init_ts_off	=	tcp_v4_init_ts_off,
	.send_synack	=	tcp_v4_send_synack,
};
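
/* tcp_conn_request() consumes the two ops tables above so that the SYN
 * handling path stays address-family independent; tcp_ipv6.c installs
 * its own counterparts for the IPv6 side.
 */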
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs send to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);
/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	inet_opt	      = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = newtp->write_seq ^ jiffies;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
				AF_INET);
	if (key) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
			       AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		ireq->ireq_opt = NULL;
	} else {
		newinet->inet_opt = NULL;
	}
	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
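
/* On return, *own_req reports whether newsk won the ehash insertion race
 * in inet_ehash_nolisten(); when it lost, another CPU already created a
 * socket for this request and the caller is expected to drop newsk
 * (see inet_csk_complete_hashdance()).
 */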
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);

	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}
/* The socket must have it's spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst = sk->sk_rx_dst;

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
			    !dst->ops->check(dst, 0)) {
				dst_release(dst);
				sk->sk_rx_dst = NULL;
			}
		}
		tcp_rcv_established(sk, skb);
		return 0;
	}

	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			goto discard;
		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
int tcp_v4_early_demux(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return 0;

	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return 0;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		return 0;

	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif, inet_sdif(skb));
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk_fullsock(sk)) {
			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);

			if (dst)
				dst = dst_check(dst, 0);
			if (dst &&
			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
	return 0;
}
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
{
	u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
	struct skb_shared_info *shinfo;
	const struct tcphdr *th;
	struct tcphdr *thtail;
	struct sk_buff *tail;
	unsigned int hdrlen;
	bool fragstolen;
	u32 gso_segs;
	int delta;

	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
	 * we can fix skb->truesize to its real value to avoid future drops.
	 * This is valid because skb is not yet charged to the socket.
	 * It has been noticed pure SACK packets were sometimes dropped
	 * (if cooked by drivers without copybreak feature).
	 */
	skb_condense(skb);

	skb_dst_drop(skb);

	if (unlikely(tcp_checksum_complete(skb))) {
		bh_unlock_sock(sk);
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
		return true;
	}

	/* Attempt coalescing to last skb in backlog, even if we are
	 * above the limits.
	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
	 */
	th = (const struct tcphdr *)skb->data;
	hdrlen = th->doff * 4;
	shinfo = skb_shinfo(skb);

	if (!shinfo->gso_size)
		shinfo->gso_size = skb->len - hdrlen;

	if (!shinfo->gso_segs)
		shinfo->gso_segs = 1;

	tail = sk->sk_backlog.tail;
	if (!tail)
		goto no_coalesce;
	thtail = (struct tcphdr *)tail->data;

	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
	    ((TCP_SKB_CB(tail)->tcp_flags |
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
	    !((TCP_SKB_CB(tail)->tcp_flags &
	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
	    ((TCP_SKB_CB(tail)->tcp_flags ^
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
#ifdef CONFIG_TLS_DEVICE
	    tail->decrypted != skb->decrypted ||
#endif
	    thtail->doff != th->doff ||
	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
		goto no_coalesce;

	__skb_pull(skb, hdrlen);
	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
		thtail->window = th->window;

		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;

		if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))
			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;

		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
		 * thtail->fin, so that the fast path in tcp_rcv_established()
		 * is not entered if we append a packet with a FIN.
		 * SYN, RST, URG are not present.
		 * ACK is set on both packets.
		 * PSH : we do not really care in TCP stack,
		 *       at least for 'GRO' packets.
		 */
		thtail->fin |= th->fin;
		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;

		if (TCP_SKB_CB(skb)->has_rxtstamp) {
			TCP_SKB_CB(tail)->has_rxtstamp = true;
			tail->tstamp = skb->tstamp;
			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
		}

		/* Not as strict as GRO. We only need to carry mss max value */
		skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
						 skb_shinfo(tail)->gso_size);

		gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
		skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);

		sk->sk_backlog.len += delta;
		__NET_INC_STATS(sock_net(sk),
				LINUX_MIB_TCPBACKLOGCOALESCE);
		kfree_skb_partial(skb, fragstolen);
		return false;
	}
	__skb_push(skb, hdrlen);

no_coalesce:
	/* Only socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Few sockets backlog are possibly concurrently non empty.
	 */
	limit += 64*1024;

	if (unlikely(sk_add_backlog(sk, skb, limit))) {
		bh_unlock_sock(sk);
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
		return true;
	}
	return false;
}
EXPORT_SYMBOL(tcp_add_backlog);
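
/* tcp_add_backlog() returns true when the segment was *not* queued
 * (checksum failure or backlog limit hit) and the caller still owns the
 * skb; it returns false when the skb has been consumed, either coalesced
 * into the backlog tail or queued as-is.
 */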
int tcp_filter(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = (struct tcphdr *)skb->data;

	return sk_filter_trim_cap(sk, skb, th->doff * 4);
}
EXPORT_SYMBOL(tcp_filter);

static void tcp_v4_restore_cb(struct sk_buff *skb)
{
	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
		sizeof(struct inet_skb_parm));
}
static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
			   const struct tcphdr *th)
{
	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
	 * barrier() makes sure compiler wont play fool^Waliasing games.
	 */
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	barrier();

	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked	 = 0;
	TCP_SKB_CB(skb)->has_rxtstamp =
			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
}
int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	struct sk_buff *skb_to_free;
	int sdif = inet_sdif(skb);
	const struct iphdr *iph;
	const struct tcphdr *th;
	bool refcounted;
	struct sock *sk;
	int ret;

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	__TCP_INC_STATS(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = (const struct tcphdr *)skb->data;

	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */

	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
lookup:
	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
			       th->dest, sdif, &refcounted);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		struct request_sock *req = inet_reqsk(sk);
		bool req_stolen = false;
		struct sock *nsk;

		sk = req->rsk_listener;
		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
			sk_drops_add(sk, skb);
			reqsk_put(req);
			goto discard_it;
		}
		if (tcp_checksum_complete(skb)) {
			reqsk_put(req);
			goto csum_error;
		}
		if (unlikely(sk->sk_state != TCP_LISTEN)) {
			inet_csk_reqsk_queue_drop_and_put(sk, req);
			goto lookup;
		}
		/* We own a reference on the listener, increase it again
		 * as we might lose it too soon.
		 */
		sock_hold(sk);
		refcounted = true;
		nsk = NULL;
		if (!tcp_filter(sk, skb)) {
			th = (const struct tcphdr *)skb->data;
			iph = ip_hdr(skb);
			tcp_v4_fill_cb(skb, iph, th);
			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
		}
		if (!nsk) {
			reqsk_put(req);
			if (req_stolen) {
				/* Another cpu got exclusive access to req
				 * and created a full blown socket.
				 * Try to feed this packet to this socket
				 * instead of discarding it.
				 */
				tcp_v4_restore_cb(skb);
				sock_put(sk);
				goto lookup;
			}
			goto discard_and_relse;
		}
		if (nsk == sk) {
			reqsk_put(req);
			tcp_v4_restore_cb(skb);
		} else if (tcp_child_process(sk, nsk, skb)) {
			tcp_v4_send_reset(nsk, skb);
			goto discard_and_relse;
		} else {
			sock_put(sk);
			return 0;
		}
	}
	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto discard_and_relse;
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

	if (tcp_v4_inbound_md5_hash(sk, skb))
		goto discard_and_relse;

	nf_reset(skb);

	if (tcp_filter(sk, skb))
		goto discard_and_relse;
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
	tcp_v4_fill_cb(skb, iph, th);

	skb->dev = NULL;

	if (sk->sk_state == TCP_LISTEN) {
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	}

	sk_incoming_cpu_update(sk);

	bh_lock_sock_nested(sk);
	tcp_segs_in(tcp_sk(sk), skb);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		skb_to_free = sk->sk_rx_skb_cache;
		sk->sk_rx_skb_cache = NULL;
		ret = tcp_v4_do_rcv(sk, skb);
	} else {
		if (tcp_add_backlog(sk, skb))
			goto discard_and_relse;
		skb_to_free = NULL;
	}
	bh_unlock_sock(sk);
	if (skb_to_free)
		__kfree_skb(skb_to_free);

put_and_return:
	if (refcounted)
		sock_put(sk);
	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
csum_error:
		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
		__TCP_INC_STATS(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sk_drops_add(sk, skb);
	if (refcounted)
		sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo, skb,
							__tcp_hdrlen(th),
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb),
							sdif);
		if (sk2) {
			inet_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
			tcp_v4_restore_cb(skb);
			refcounted = false;
			goto process;
		}
	}
		/* to ACK */
		/* fall through */
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		tcp_v4_send_reset(sk, skb);
		inet_twsk_deschedule_put(inet_twsk(sk));
		goto discard_it;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor= tcp_twsk_destructor,
};

void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	if (dst && dst_hold_safe(dst)) {
		sk->sk_rx_dst = dst;
		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
	}
}
EXPORT_SYMBOL(inet_sk_rx_dst_set);
const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_ip_setsockopt,
	.compat_getsockopt = compat_ip_getsockopt,
#endif
	.mtu_reduced	   = tcp_v4_mtu_reduced,
};
EXPORT_SYMBOL(ipv4_specific);

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
	.md5_parse	= tcp_v4_parse_md5_keys,
};
#endif
/* NOTE: A lot of things set to zero explicitly by call to
 *       sk_alloc() so need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_init_sock(sk);

	icsk->icsk_af_ops = &ipv4_specific;

#ifdef CONFIG_TCP_MD5SIG
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

	return 0;
}
void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	trace_tcp_destroy_sock(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	tcp_cleanup_ulp(sk);

	/* Cleanup up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Check if we want to disable active TFO */
	tcp_fastopen_active_disable_ofo_check(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	skb_rbtree_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_clear_md5_list(sk);
		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
		tp->md5sig_info = NULL;
	}
#endif

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	BUG_ON(tp->fastopen_rsk);

	/* If socket is aborted during connect operation */
	tcp_free_fastopen_req(tp);
	tcp_fastopen_destroy_cipher(sk);
	tcp_saved_syn_free(tp);

	sk_sockets_allocated_dec(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);
#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

/*
 * Get next listener socket follow cur.  If cur is NULL, get first socket
 * starting from bucket given in st->bucket; when st->bucket is zero the
 * very first socket in the hash table is returned.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	struct inet_listen_hashbucket *ilb;
	struct sock *sk = cur;

	if (!sk) {
get_head:
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock(&ilb->lock);
		sk = sk_head(&ilb->head);
		st->offset = 0;
		goto get_sk;
	}
	ilb = &tcp_hashinfo.listening_hash[st->bucket];
	++st->num;
	++st->offset;

	sk = sk_next(sk);
get_sk:
	sk_for_each_from(sk) {
		if (!net_eq(sock_net(sk), net))
			continue;
		if (sk->sk_family == afinfo->family)
			return sk;
	}
	spin_unlock(&ilb->lock);
	st->offset = 0;
	if (++st->bucket < INET_LHTABLE_SIZE)
		goto get_head;
	return NULL;
}
static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	st->offset = 0;
	rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}

static inline bool empty_bucket(const struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
}
2182 * If st->bucket is zero, the very first socket in the hash is returned.
2184 static void *established_get_first(struct seq_file *seq)
2186 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2187 struct tcp_iter_state *st = seq->private;
2188 struct net *net = seq_file_net(seq);
2192 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2194 struct hlist_nulls_node *node;
2195 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2197 /* Lockless fast path for the common case of empty buckets */
2198 if (empty_bucket(st))
2202 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2203 if (sk->sk_family != afinfo->family ||
2204 !net_eq(sock_net(sk), net)) {
2210 spin_unlock_bh(lock);
static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
	struct sock *sk = cur;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if (sk->sk_family == afinfo->family &&
		    net_eq(sock_net(sk), net))
			return sk;
	}

	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}
static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}
static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc	  = listening_get_idx(seq, &pos);

	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc	  = established_get_idx(seq, pos);
	}

	return rc;
}
static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket >= INET_LHTABLE_SIZE)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_next(seq, NULL);
		while (offset-- && rc)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		/* Fallthrough */
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}
void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_start);
void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_next);
void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}
EXPORT_SYMBOL(tcp_seq_stop);
static void get_openreq4(const struct request_sock *req,
			 struct seq_file *f, int i)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->rsk_timer.expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ireq->ir_num,
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f),
				 sock_i_uid(req->rsk_listener)),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		0,
		req);
}
static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;
	int state;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires = jiffies;
	}

	state = inet_sk_state_load(sk);
	if (state == TCP_LISTEN)
		rx_queue = sk->sk_ack_backlog;
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, state,
		tp->write_seq - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		refcount_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
		tp->snd_cwnd,
		state == TCP_LISTEN ?
		    fastopenq->max_qlen :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}
static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	long delta = tw->tw_timer.expires - jiffies;
	__be32 dest, src;
	__u16 destp, srcp;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		refcount_read(&tw->tw_refcnt), tw);
}

#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	struct sock *sk = v;

	seq_setwidth(seq, TMPSZ - 1);
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	if (sk->sk_state == TCP_TIME_WAIT)
		get_timewait4_sock(v, seq, st->num);
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		get_openreq4(v, seq, st->num);
	else
		get_tcp4_sock(v, seq, st->num);
out:
	seq_pad(seq, '\n');
	return 0;
}

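/* Illustrative example (not part of the original file): with the %08X
 * formatting used above, addresses are printed as raw __be32 values, so on
 * a little-endian host 127.0.0.1:8080 shows up as "0100007F:1F90".  A
 * hypothetical userspace parser can recover the leading fields of one row
 * with sscanf():
 *
 *	unsigned int sl, local_ip, local_port, remote_ip, remote_port, st;
 *
 *	sscanf(line, "%u: %8X:%4X %8X:%4X %2X",
 *	       &sl, &local_ip, &local_port, &remote_ip, &remote_port, &st);
 */
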
static const struct seq_operations tcp4_seq_ops = {
	.show		= tcp4_seq_show,
	.start		= tcp_seq_start,
	.next		= tcp_seq_next,
	.stop		= tcp_seq_stop,
};

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.family		= AF_INET,
};

static int __net_init tcp4_proc_init_net(struct net *net)
{
	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
		return -ENOMEM;
	return 0;
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	remove_proc_entry("tcp", net->proc_net);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */

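/* Illustrative example (userspace, not part of this file): because the
 * /proc entry is registered through pernet_operations, each network
 * namespace sees only its own sockets.  A minimal sequential reader --
 * which also keeps tcp_seq_start() on its st->last_pos fast path -- looks
 * like this:
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		char buf[4096];
 *		size_t n;
 *		FILE *f = fopen("/proc/net/tcp", "r");
 *
 *		if (!f)
 *			return 1;
 *		while ((n = fread(buf, 1, sizeof(buf), f)) > 0)
 *			fwrite(buf, 1, n, stdout);
 *		fclose(f);
 *		return 0;
 *	}
 */
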
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.pre_connect		= tcp_v4_pre_connect,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.keepalive		= tcp_set_keepalive,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.leave_memory_pressure	= tcp_leave_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);

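/* Illustrative example (userspace, not part of this file): inet_create()
 * selects tcp_prot for AF_INET/SOCK_STREAM sockets, after which the generic
 * socket layer dispatches through the callbacks above.  Error handling is
 * omitted and the address (RFC 5737 TEST-NET-1) is hypothetical:
 *
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *	#include <arpa/inet.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		struct sockaddr_in sa = { .sin_family = AF_INET,
 *					  .sin_port = htons(80) };
 *		int fd = socket(AF_INET, SOCK_STREAM, 0);	// pairs fd with tcp_prot
 *
 *		inet_pton(AF_INET, "192.0.2.1", &sa.sin_addr);
 *		connect(fd, (struct sockaddr *)&sa, sizeof(sa));	// -> tcp_v4_connect()
 *		write(fd, "x", 1);	// -> tcp_sendmsg()
 *		close(fd);		// -> tcp_close()
 *		return 0;
 *	}
 */
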
static void __net_exit tcp_sk_exit(struct net *net)
{
	int cpu;

	if (net->ipv4.tcp_congestion_control)
		module_put(net->ipv4.tcp_congestion_control->owner);

	for_each_possible_cpu(cpu)
		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
	free_percpu(net->ipv4.tcp_sk);
}

static int __net_init tcp_sk_init(struct net *net)
{
	int res, cpu, cnt;

	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
	if (!net->ipv4.tcp_sk)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, net);
		if (res)
			goto fail;
		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

		/* Please enforce IP_DF and IPID==0 for RST and
		 * ACK sent in SYN-RECV and TIME-WAIT state.
		 */
		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;

		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
	}

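	/* Note: these per-cpu kernel sockets back ip_send_unicast_reply()
	 * when TCP must emit a RST or ACK without a full socket of its own,
	 * as in tcp_v4_send_reset() and tcp_v4_send_ack() earlier in this
	 * file.
	 */
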
	net->ipv4.sysctl_tcp_ecn = 2;
	net->ipv4.sysctl_tcp_ecn_fallback = 1;

	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;

	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	net->ipv4.sysctl_tcp_syncookies = 1;
	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
	net->ipv4.sysctl_tcp_orphan_retries = 0;
	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
	net->ipv4.sysctl_tcp_tw_reuse = 2;
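	/* Note: 2 enables TIME-WAIT reuse for loopback connections only;
	 * 0 disables the mechanism and 1 enables it for all connections.
	 * The loopback test lives in tcp_twsk_unique().
	 */
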
	cnt = tcp_hashinfo.ehash_mask + 1;
	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;

	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
	net->ipv4.sysctl_tcp_sack = 1;
	net->ipv4.sysctl_tcp_window_scaling = 1;
	net->ipv4.sysctl_tcp_timestamps = 1;
	net->ipv4.sysctl_tcp_early_retrans = 3;
	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
	net->ipv4.sysctl_tcp_retrans_collapse = 1;
	net->ipv4.sysctl_tcp_max_reordering = 300;
	net->ipv4.sysctl_tcp_dsack = 1;
	net->ipv4.sysctl_tcp_app_win = 31;
	net->ipv4.sysctl_tcp_adv_win_scale = 1;
	net->ipv4.sysctl_tcp_frto = 2;
	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
	/* This limits the percentage of the congestion window which we
	 * will allow a single TSO frame to consume.  Building TSO frames
	 * which are too large can cause TCP streams to be bursty.
	 */
	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
	/* Default TSQ limit of 16 TSO segments */
	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
	/* rfc5961 challenge ack rate limiting */
	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
	net->ipv4.sysctl_tcp_min_tso_segs = 2;
	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
	net->ipv4.sysctl_tcp_autocorking = 1;
	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
	if (net != &init_net) {
		memcpy(net->ipv4.sysctl_tcp_rmem,
		       init_net.ipv4.sysctl_tcp_rmem,
		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
		memcpy(net->ipv4.sysctl_tcp_wmem,
		       init_net.ipv4.sysctl_tcp_wmem,
		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
	}
	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
	atomic_set(&net->ipv4.tfo_active_disable_times, 0);

	/* Reno is always built in */
	if (!net_eq(net, &init_net) &&
	    try_module_get(init_net.ipv4.tcp_congestion_control->owner))
		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
	else
		net->ipv4.tcp_congestion_control = &tcp_reno;

	return 0;
fail:
	tcp_sk_exit(net);

	return res;
}

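/* Note: a child namespace inherits init_net's congestion control only when
 * a module reference can be taken; tcp_reno is built into the kernel, so
 * the fallback in tcp_sk_init() above cannot fail.
 */
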
static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	struct net *net;

	inet_twsk_purge(&tcp_hashinfo, AF_INET);

	list_for_each_entry(net, net_exit_list, exit_list)
		tcp_fastopen_ctx_destroy(net);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
	.init	   = tcp_sk_init,
	.exit	   = tcp_sk_exit,
	.exit_batch = tcp_sk_exit_batch,
};

void __init tcp_v4_init(void)
{
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
}