net/ipv4/tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  *              IPv4 specific functions
9  *
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  *
18  *      This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23
24 /*
25  * Changes:
26  *              David S. Miller :       New socket lookup architecture.
27  *                                      This code is dedicated to John Dyson.
28  *              David S. Miller :       Change semantics of established hash,
29  *                                      half is devoted to TIME_WAIT sockets
30  *                                      and the rest go in the other half.
31  *              Andi Kleen :            Add support for syncookies and fixed
32  *                                      some bugs: ip options weren't passed to
33  *                                      the TCP layer, missed a check for an
34  *                                      ACK bit.
35  *              Andi Kleen :            Implemented fast path mtu discovery.
36  *                                      Fixed many serious bugs in the
37  *                                      request_sock handling and moved
38  *                                      most of it into the af independent code.
39  *                                      Added tail drop and some other bugfixes.
40  *                                      Added new listen semantics.
41  *              Mike McLagan    :       Routing by source
42  *      Juan Jose Ciarlante:            ip_dynaddr bits
43  *              Andi Kleen:             various fixes.
44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
45  *                                      coma.
46  *      Andi Kleen              :       Fix new listen.
47  *      Andi Kleen              :       Fix accept error reporting.
48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
50  *                                      a single port at the same time.
51  */
52
53 #define pr_fmt(fmt) "TCP: " fmt
54
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/busy_poll.h>
77
78 #include <linux/inet.h>
79 #include <linux/ipv6.h>
80 #include <linux/stddef.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83 #include <linux/inetdevice.h>
84
85 #include <crypto/hash.h>
86 #include <linux/scatterlist.h>
87
88 #include <trace/events/tcp.h>
89
90 #ifdef CONFIG_TCP_MD5SIG
91 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
92                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
93 #endif
94
95 struct inet_hashinfo tcp_hashinfo;
96 EXPORT_SYMBOL(tcp_hashinfo);
97
98 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
99 {
100         return secure_tcp_seq(ip_hdr(skb)->daddr,
101                               ip_hdr(skb)->saddr,
102                               tcp_hdr(skb)->dest,
103                               tcp_hdr(skb)->source);
104 }
105
106 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
107 {
108         return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
109 }
110
111 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
112 {
113         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114         struct tcp_sock *tp = tcp_sk(sk);
115
116         /* With PAWS, it is safe from the viewpoint
117            of data integrity. Even without PAWS it is safe provided sequence
118            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
119
120            Actually, the idea is close to VJ's one, only timestamp cache is
121            held not per host, but per port pair and TW bucket is used as state
122            holder.
123
124            If TW bucket has been already destroyed we fall back to VJ's scheme
125            and use initial timestamp retrieved from peer table.
126          */
127         if (tcptw->tw_ts_recent_stamp &&
128             (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse &&
129                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
130                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
131                 if (tp->write_seq == 0)
132                         tp->write_seq = 1;
133                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
134                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
135                 sock_hold(sktw);
136                 return 1;
137         }
138
139         return 0;
140 }
141 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
142
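/*
 * Note on the reuse decision above: it is gated by the tcp_tw_reuse sysctl
 * and only taken when the last timestamp remembered in the TIME-WAIT bucket
 * is more than one second old, so segments from the old incarnation cannot
 * be confused with the new one.  As a purely illustrative example, an
 * administrator enables outgoing TIME-WAIT reuse with:
 *
 *	sysctl -w net.ipv4.tcp_tw_reuse=1
 */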
143 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
144                               int addr_len)
145 {
146         /* This check is replicated from tcp_v4_connect() and intended to
147          * prevent the BPF program called below from accessing bytes that are
148          * outside the bound specified by the user in addr_len.
149          */
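        /* BPF_CGROUP_RUN_PROG_INET4_CONNECT runs any BPF program of type
         * BPF_PROG_TYPE_CGROUP_SOCK_ADDR attached to the socket's cgroup at
         * the BPF_CGROUP_INET4_CONNECT hook; such a program may inspect or
         * rewrite the destination in uaddr before the real connect proceeds,
         * hence the bounds check above.
         */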
150         if (addr_len < sizeof(struct sockaddr_in))
151                 return -EINVAL;
152
153         sock_owned_by_me(sk);
154
155         return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
156 }
157
158 /* This will initiate an outgoing connection. */
159 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
160 {
161         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
162         struct inet_sock *inet = inet_sk(sk);
163         struct tcp_sock *tp = tcp_sk(sk);
164         __be16 orig_sport, orig_dport;
165         __be32 daddr, nexthop;
166         struct flowi4 *fl4;
167         struct rtable *rt;
168         int err;
169         struct ip_options_rcu *inet_opt;
170         struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
171
172         if (addr_len < sizeof(struct sockaddr_in))
173                 return -EINVAL;
174
175         if (usin->sin_family != AF_INET)
176                 return -EAFNOSUPPORT;
177
178         nexthop = daddr = usin->sin_addr.s_addr;
179         inet_opt = rcu_dereference_protected(inet->inet_opt,
180                                              lockdep_sock_is_held(sk));
181         if (inet_opt && inet_opt->opt.srr) {
182                 if (!daddr)
183                         return -EINVAL;
184                 nexthop = inet_opt->opt.faddr;
185         }
186
187         orig_sport = inet->inet_sport;
188         orig_dport = usin->sin_port;
189         fl4 = &inet->cork.fl.u.ip4;
190         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
191                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
192                               IPPROTO_TCP,
193                               orig_sport, orig_dport, sk);
194         if (IS_ERR(rt)) {
195                 err = PTR_ERR(rt);
196                 if (err == -ENETUNREACH)
197                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
198                 return err;
199         }
200
201         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
202                 ip_rt_put(rt);
203                 return -ENETUNREACH;
204         }
205
206         if (!inet_opt || !inet_opt->opt.srr)
207                 daddr = fl4->daddr;
208
209         if (!inet->inet_saddr)
210                 inet->inet_saddr = fl4->saddr;
211         sk_rcv_saddr_set(sk, inet->inet_saddr);
212
213         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
214                 /* Reset inherited state */
215                 tp->rx_opt.ts_recent       = 0;
216                 tp->rx_opt.ts_recent_stamp = 0;
217                 if (likely(!tp->repair))
218                         tp->write_seq      = 0;
219         }
220
221         inet->inet_dport = usin->sin_port;
222         sk_daddr_set(sk, daddr);
223
224         inet_csk(sk)->icsk_ext_hdr_len = 0;
225         if (inet_opt)
226                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
227
228         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
229
230         /* Socket identity is still unknown (sport may be zero).
231          * However we set state to SYN-SENT and, while still holding the
232          * socket lock, select a source port, enter ourselves into the hash
233          * tables and complete initialization after this.
234          */
235         tcp_set_state(sk, TCP_SYN_SENT);
236         err = inet_hash_connect(tcp_death_row, sk);
237         if (err)
238                 goto failure;
239
240         sk_set_txhash(sk);
241
242         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
243                                inet->inet_sport, inet->inet_dport, sk);
244         if (IS_ERR(rt)) {
245                 err = PTR_ERR(rt);
246                 rt = NULL;
247                 goto failure;
248         }
249         /* OK, now commit destination to socket.  */
250         sk->sk_gso_type = SKB_GSO_TCPV4;
251         sk_setup_caps(sk, &rt->dst);
252         rt = NULL;
253
254         if (likely(!tp->repair)) {
255                 if (!tp->write_seq)
256                         tp->write_seq = secure_tcp_seq(inet->inet_saddr,
257                                                        inet->inet_daddr,
258                                                        inet->inet_sport,
259                                                        usin->sin_port);
260                 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
261                                                  inet->inet_saddr,
262                                                  inet->inet_daddr);
263         }
264
265         inet->inet_id = tp->write_seq ^ jiffies;
266
267         if (tcp_fastopen_defer_connect(sk, &err))
268                 return err;
269         if (err)
270                 goto failure;
271
272         err = tcp_connect(sk);
273
274         if (err)
275                 goto failure;
276
277         return 0;
278
279 failure:
280         /*
281          * This unhashes the socket and releases the local port,
282          * if necessary.
283          */
284         tcp_set_state(sk, TCP_CLOSE);
285         ip_rt_put(rt);
286         sk->sk_route_caps = 0;
287         inet->inet_dport = 0;
288         return err;
289 }
290 EXPORT_SYMBOL(tcp_v4_connect);
291
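/*
 * For orientation: tcp_v4_connect() is reached through __inet_stream_connect()
 * when user space calls connect(2) on an IPv4 stream socket.  A minimal
 * user-space sketch (illustrative only, error handling omitted):
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 */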
292 /*
293  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
294  * It can be called through tcp_release_cb() if socket was owned by user
295  * at the time tcp_v4_err() was called to handle ICMP message.
296  */
297 void tcp_v4_mtu_reduced(struct sock *sk)
298 {
299         struct inet_sock *inet = inet_sk(sk);
300         struct dst_entry *dst;
301         u32 mtu;
302
303         if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
304                 return;
305         mtu = tcp_sk(sk)->mtu_info;
306         dst = inet_csk_update_pmtu(sk, mtu);
307         if (!dst)
308                 return;
309
310         /* Something is about to go wrong... Remember the soft error
311          * in case this connection is not able to recover.
312          */
313         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
314                 sk->sk_err_soft = EMSGSIZE;
315
316         mtu = dst_mtu(dst);
317
318         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
319             ip_sk_accept_pmtu(sk) &&
320             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
321                 tcp_sync_mss(sk, mtu);
322
323                 /* Resend the TCP packet because it's
324                  * clear that the old packet has been
325                  * dropped. This is the new "fast" path mtu
326                  * discovery.
327                  */
328                 tcp_simple_retransmit(sk);
329         } /* else let the usual retransmit timer handle it */
330 }
331 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
332
333 static void do_redirect(struct sk_buff *skb, struct sock *sk)
334 {
335         struct dst_entry *dst = __sk_dst_check(sk, 0);
336
337         if (dst)
338                 dst->ops->redirect(dst, sk, skb);
339 }
340
341
342 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
343 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
344 {
345         struct request_sock *req = inet_reqsk(sk);
346         struct net *net = sock_net(sk);
347
348         /* ICMPs are not backlogged, hence we cannot get
349          * an established socket here.
350          */
351         if (seq != tcp_rsk(req)->snt_isn) {
352                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
353         } else if (abort) {
354                 /*
355                  * Still in SYN_RECV, just remove it silently.
356                  * There is no good way to pass the error to the newly
357                  * created socket, and POSIX does not want network
358                  * errors returned from accept().
359                  */
360                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
361                 tcp_listendrop(req->rsk_listener);
362         }
363         reqsk_put(req);
364 }
365 EXPORT_SYMBOL(tcp_req_err);
366
367 /*
368  * This routine is called by the ICMP module when it gets some
369  * sort of error condition.  If err < 0 then the socket should
370  * be closed and the error returned to the user.  If err > 0
371  * it's just the icmp type << 8 | icmp code.  After adjustment
372  * header points to the first 8 bytes of the tcp header.  We need
373  * to find the appropriate port.
374  *
375  * The locking strategy used here is very "optimistic". When
376  * someone else accesses the socket the ICMP is just dropped
377  * and for some paths there is no check at all.
378  * A more general error queue to queue errors for later handling
379  * is probably better.
380  *
381  */
382
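/*
 * As a concrete example of the conversion mentioned above: an incoming
 * ICMP_DEST_UNREACH with code ICMP_PORT_UNREACH is mapped through
 * icmp_err_convert[] to ECONNREFUSED, which is what a connect(2) caller
 * eventually sees when nothing listens on the destination port.
 */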
383 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
384 {
385         const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
386         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
387         struct inet_connection_sock *icsk;
388         struct tcp_sock *tp;
389         struct inet_sock *inet;
390         const int type = icmp_hdr(icmp_skb)->type;
391         const int code = icmp_hdr(icmp_skb)->code;
392         struct sock *sk;
393         struct sk_buff *skb;
394         struct request_sock *fastopen;
395         u32 seq, snd_una;
396         s32 remaining;
397         u32 delta_us;
398         int err;
399         struct net *net = dev_net(icmp_skb->dev);
400
401         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
402                                        th->dest, iph->saddr, ntohs(th->source),
403                                        inet_iif(icmp_skb), 0);
404         if (!sk) {
405                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
406                 return;
407         }
408         if (sk->sk_state == TCP_TIME_WAIT) {
409                 inet_twsk_put(inet_twsk(sk));
410                 return;
411         }
412         seq = ntohl(th->seq);
413         if (sk->sk_state == TCP_NEW_SYN_RECV)
414                 return tcp_req_err(sk, seq,
415                                   type == ICMP_PARAMETERPROB ||
416                                   type == ICMP_TIME_EXCEEDED ||
417                                   (type == ICMP_DEST_UNREACH &&
418                                    (code == ICMP_NET_UNREACH ||
419                                     code == ICMP_HOST_UNREACH)));
420
421         bh_lock_sock(sk);
422         /* If too many ICMPs get dropped on busy
423          * servers this needs to be solved differently.
424          * We do take care of the PMTU discovery (RFC1191) special case:
425          * we can receive locally generated ICMP messages while the socket is held.
426          */
427         if (sock_owned_by_user(sk)) {
428                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
429                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
430         }
431         if (sk->sk_state == TCP_CLOSE)
432                 goto out;
433
434         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
435                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
436                 goto out;
437         }
438
439         icsk = inet_csk(sk);
440         tp = tcp_sk(sk);
441         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
442         fastopen = tp->fastopen_rsk;
443         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
444         if (sk->sk_state != TCP_LISTEN &&
445             !between(seq, snd_una, tp->snd_nxt)) {
446                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
447                 goto out;
448         }
449
450         switch (type) {
451         case ICMP_REDIRECT:
452                 if (!sock_owned_by_user(sk))
453                         do_redirect(icmp_skb, sk);
454                 goto out;
455         case ICMP_SOURCE_QUENCH:
456                 /* Just silently ignore these. */
457                 goto out;
458         case ICMP_PARAMETERPROB:
459                 err = EPROTO;
460                 break;
461         case ICMP_DEST_UNREACH:
462                 if (code > NR_ICMP_UNREACH)
463                         goto out;
464
465                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
466                         /* We are not interested in TCP_LISTEN and open_requests
467                          * (SYN-ACKs sent out by Linux are always < 576 bytes so
468                          * they should go through unfragmented).
469                          */
470                         if (sk->sk_state == TCP_LISTEN)
471                                 goto out;
472
473                         tp->mtu_info = info;
474                         if (!sock_owned_by_user(sk)) {
475                                 tcp_v4_mtu_reduced(sk);
476                         } else {
477                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
478                                         sock_hold(sk);
479                         }
480                         goto out;
481                 }
482
483                 err = icmp_err_convert[code].errno;
484                 /* check if icmp_skb allows revert of backoff
485                  * (see draft-zimmermann-tcp-lcd) */
486                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
487                         break;
488                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
489                     !icsk->icsk_backoff || fastopen)
490                         break;
491
492                 if (sock_owned_by_user(sk))
493                         break;
494
495                 icsk->icsk_backoff--;
496                 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
497                                                TCP_TIMEOUT_INIT;
498                 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
499
500                 skb = tcp_rtx_queue_head(sk);
501                 BUG_ON(!skb);
502
503                 tcp_mstamp_refresh(tp);
504                 delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
505                 remaining = icsk->icsk_rto -
506                             usecs_to_jiffies(delta_us);
507
508                 if (remaining > 0) {
509                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
510                                                   remaining, TCP_RTO_MAX);
511                 } else {
512                         /* RTO revert clocked out retransmission.
513                          * Will retransmit now */
514                         tcp_retransmit_timer(sk);
515                 }
516
517                 break;
518         case ICMP_TIME_EXCEEDED:
519                 err = EHOSTUNREACH;
520                 break;
521         default:
522                 goto out;
523         }
524
525         switch (sk->sk_state) {
526         case TCP_SYN_SENT:
527         case TCP_SYN_RECV:
528                 /* Only in fast or simultaneous open. If a fast open socket
529                  * is already accepted it is treated as a connected one below.
530                  */
531                 if (fastopen && !fastopen->sk)
532                         break;
533
534                 if (!sock_owned_by_user(sk)) {
535                         sk->sk_err = err;
536
537                         sk->sk_error_report(sk);
538
539                         tcp_done(sk);
540                 } else {
541                         sk->sk_err_soft = err;
542                 }
543                 goto out;
544         }
545
546         /* If we've already connected we will keep trying
547          * until we time out, or the user gives up.
548          *
549          * rfc1122 4.2.3.9 allows us to consider only PROTO_UNREACH and
550          * PORT_UNREACH as hard errors (well, FRAG_FAILED too, but it is
551          * obsoleted by pmtu discovery).
552          *
553          * Note that in the modern internet, where routing is unreliable
554          * and broken firewalls sit in every dark corner sending random
555          * errors ordered by their masters, even these two messages have
556          * finally lost their original sense (even Linux sends invalid PORT_UNREACHs).
557          *
558          * Now we are in compliance with RFCs.
559          *                                                      --ANK (980905)
560          */
561
562         inet = inet_sk(sk);
563         if (!sock_owned_by_user(sk) && inet->recverr) {
564                 sk->sk_err = err;
565                 sk->sk_error_report(sk);
566         } else  { /* Only an error on timeout */
567                 sk->sk_err_soft = err;
568         }
569
570 out:
571         bh_unlock_sock(sk);
572         sock_put(sk);
573 }
574
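/*
 * Prepare the segment for checksum offload (CHECKSUM_PARTIAL): only the
 * pseudo-header sum is stored in th->check here; csum_start/csum_offset
 * tell the device (or skb_checksum_help()) where to fold in the checksum
 * over the TCP header and payload.
 */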
575 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
576 {
577         struct tcphdr *th = tcp_hdr(skb);
578
579         th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
580         skb->csum_start = skb_transport_header(skb) - skb->head;
581         skb->csum_offset = offsetof(struct tcphdr, check);
582 }
583
584 /* This routine computes an IPv4 TCP checksum. */
585 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
586 {
587         const struct inet_sock *inet = inet_sk(sk);
588
589         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
590 }
591 EXPORT_SYMBOL(tcp_v4_send_check);
592
593 /*
594  *      This routine will send an RST to the other tcp.
595  *
596  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
597  *                    for the reset?
598  *      Answer: if a packet caused an RST, it is not for a socket
599  *              existing in our system; if it is matched to a socket,
600  *              it is just a duplicate segment or a bug in the other side's TCP.
601  *              So we build the reply based only on the parameters
602  *              that arrived with the segment.
603  *      Exception: precedence violation. We do not implement it in any case.
604  */
605
606 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
607 {
608         const struct tcphdr *th = tcp_hdr(skb);
609         struct {
610                 struct tcphdr th;
611 #ifdef CONFIG_TCP_MD5SIG
612                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
613 #endif
614         } rep;
615         struct ip_reply_arg arg;
616 #ifdef CONFIG_TCP_MD5SIG
617         struct tcp_md5sig_key *key = NULL;
618         const __u8 *hash_location = NULL;
619         unsigned char newhash[16];
620         int genhash;
621         struct sock *sk1 = NULL;
622 #endif
623         struct net *net;
624
625         /* Never send a reset in response to a reset. */
626         if (th->rst)
627                 return;
628
629         /* If sk is not NULL, it means we did a successful lookup and the
630          * incoming route had to be correct. prequeue might have dropped our dst.
631          */
632         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
633                 return;
634
635         /* Swap the send and the receive. */
636         memset(&rep, 0, sizeof(rep));
637         rep.th.dest   = th->source;
638         rep.th.source = th->dest;
639         rep.th.doff   = sizeof(struct tcphdr) / 4;
640         rep.th.rst    = 1;
641
642         if (th->ack) {
643                 rep.th.seq = th->ack_seq;
644         } else {
645                 rep.th.ack = 1;
646                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
647                                        skb->len - (th->doff << 2));
648         }
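        /* E.g. a stray SYN (no ACK bit, seq = X, no payload) is answered with
         * rep.th.seq = 0 and rep.th.ack_seq = X + 1.
         */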
649
650         memset(&arg, 0, sizeof(arg));
651         arg.iov[0].iov_base = (unsigned char *)&rep;
652         arg.iov[0].iov_len  = sizeof(rep.th);
653
654         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
655 #ifdef CONFIG_TCP_MD5SIG
656         rcu_read_lock();
657         hash_location = tcp_parse_md5sig_option(th);
658         if (sk && sk_fullsock(sk)) {
659                 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
660                                         &ip_hdr(skb)->saddr, AF_INET);
661         } else if (hash_location) {
662                 /*
663                  * The active side is lost. Try to find the listening socket via
664                  * the source port, and then find the md5 key through that socket.
665                  * We do not lose security here:
666                  * the incoming packet is checked against the md5 hash of the key
667                  * we find, and no RST is generated if the hash doesn't match.
668                  */
669                 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
670                                              ip_hdr(skb)->saddr,
671                                              th->source, ip_hdr(skb)->daddr,
672                                              ntohs(th->source), inet_iif(skb),
673                                              tcp_v4_sdif(skb));
674                 /* don't send an rst if we can't find a key */
675                 if (!sk1)
676                         goto out;
677
678                 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
679                                         &ip_hdr(skb)->saddr, AF_INET);
680                 if (!key)
681                         goto out;
682
683
684                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
685                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
686                         goto out;
687
688         }
689
690         if (key) {
691                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
692                                    (TCPOPT_NOP << 16) |
693                                    (TCPOPT_MD5SIG << 8) |
694                                    TCPOLEN_MD5SIG);
695                 /* Update length and the length the header thinks exists */
696                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
697                 rep.th.doff = arg.iov[0].iov_len / 4;
698
699                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
700                                      key, ip_hdr(skb)->saddr,
701                                      ip_hdr(skb)->daddr, &rep.th);
702         }
703 #endif
704         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
705                                       ip_hdr(skb)->saddr, /* XXX */
706                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
707         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
708         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
709
710         /* When the socket is gone, all binding information is lost.
711          * Routing might fail in this case. No choice here: if we choose to force
712          * the input interface, we will misroute in the case of an asymmetric route.
713          */
714         if (sk) {
715                 arg.bound_dev_if = sk->sk_bound_dev_if;
716                 if (sk_fullsock(sk))
717                         trace_tcp_send_reset(sk, skb);
718         }
719
720         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
721                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
722
723         arg.tos = ip_hdr(skb)->tos;
724         arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
725         local_bh_disable();
726         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
727                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
728                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
729                               &arg, arg.iov[0].iov_len);
730
731         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
732         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
733         local_bh_enable();
734
735 #ifdef CONFIG_TCP_MD5SIG
736 out:
737         rcu_read_unlock();
738 #endif
739 }
740
741 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
742    outside socket context, is certainly ugly. What can I do?
743  */
744
745 static void tcp_v4_send_ack(const struct sock *sk,
746                             struct sk_buff *skb, u32 seq, u32 ack,
747                             u32 win, u32 tsval, u32 tsecr, int oif,
748                             struct tcp_md5sig_key *key,
749                             int reply_flags, u8 tos)
750 {
751         const struct tcphdr *th = tcp_hdr(skb);
752         struct {
753                 struct tcphdr th;
754                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
755 #ifdef CONFIG_TCP_MD5SIG
756                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
757 #endif
758                         ];
759         } rep;
760         struct net *net = sock_net(sk);
761         struct ip_reply_arg arg;
762
763         memset(&rep.th, 0, sizeof(struct tcphdr));
764         memset(&arg, 0, sizeof(arg));
765
766         arg.iov[0].iov_base = (unsigned char *)&rep;
767         arg.iov[0].iov_len  = sizeof(rep.th);
768         if (tsecr) {
769                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
770                                    (TCPOPT_TIMESTAMP << 8) |
771                                    TCPOLEN_TIMESTAMP);
772                 rep.opt[1] = htonl(tsval);
773                 rep.opt[2] = htonl(tsecr);
774                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
775         }
776
777         /* Swap the send and the receive. */
778         rep.th.dest    = th->source;
779         rep.th.source  = th->dest;
780         rep.th.doff    = arg.iov[0].iov_len / 4;
781         rep.th.seq     = htonl(seq);
782         rep.th.ack_seq = htonl(ack);
783         rep.th.ack     = 1;
784         rep.th.window  = htons(win);
785
786 #ifdef CONFIG_TCP_MD5SIG
787         if (key) {
788                 int offset = (tsecr) ? 3 : 0;
789
790                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
791                                           (TCPOPT_NOP << 16) |
792                                           (TCPOPT_MD5SIG << 8) |
793                                           TCPOLEN_MD5SIG);
794                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
795                 rep.th.doff = arg.iov[0].iov_len/4;
796
797                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
798                                     key, ip_hdr(skb)->saddr,
799                                     ip_hdr(skb)->daddr, &rep.th);
800         }
801 #endif
802         arg.flags = reply_flags;
803         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
804                                       ip_hdr(skb)->saddr, /* XXX */
805                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
806         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
807         if (oif)
808                 arg.bound_dev_if = oif;
809         arg.tos = tos;
810         arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
811         local_bh_disable();
812         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
813                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
814                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
815                               &arg, arg.iov[0].iov_len);
816
817         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
818         local_bh_enable();
819 }
820
821 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
822 {
823         struct inet_timewait_sock *tw = inet_twsk(sk);
824         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
825
826         tcp_v4_send_ack(sk, skb,
827                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
828                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
829                         tcp_time_stamp_raw() + tcptw->tw_ts_offset,
830                         tcptw->tw_ts_recent,
831                         tw->tw_bound_dev_if,
832                         tcp_twsk_md5_key(tcptw),
833                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
834                         tw->tw_tos
835                         );
836
837         inet_twsk_put(tw);
838 }
839
840 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
841                                   struct request_sock *req)
842 {
843         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
844          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
845          */
846         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
847                                              tcp_sk(sk)->snd_nxt;
848
849         /* RFC 7323 2.3
850          * The window field (SEG.WND) of every outgoing segment, with the
851          * exception of <SYN> segments, MUST be right-shifted by
852          * Rcv.Wind.Shift bits:
853          */
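        /* For example, rsk_rcv_wnd = 1048576 with rcv_wscale = 7 puts
         * 1048576 >> 7 = 8192 in the advertised window field below.
         */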
854         tcp_v4_send_ack(sk, skb, seq,
855                         tcp_rsk(req)->rcv_nxt,
856                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
857                         tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
858                         req->ts_recent,
859                         0,
860                         tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
861                                           AF_INET),
862                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
863                         ip_hdr(skb)->tos);
864 }
865
866 /*
867  *      Send a SYN-ACK after having received a SYN.
868  *      This still operates on a request_sock only, not on a big
869  *      socket.
870  */
871 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
872                               struct flowi *fl,
873                               struct request_sock *req,
874                               struct tcp_fastopen_cookie *foc,
875                               enum tcp_synack_type synack_type)
876 {
877         const struct inet_request_sock *ireq = inet_rsk(req);
878         struct flowi4 fl4;
879         int err = -1;
880         struct sk_buff *skb;
881
882         /* First, grab a route. */
883         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
884                 return -1;
885
886         skb = tcp_make_synack(sk, dst, req, foc, synack_type);
887
888         if (skb) {
889                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
890
891                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
892                                             ireq->ir_rmt_addr,
893                                             ireq_opt_deref(ireq));
894                 err = net_xmit_eval(err);
895         }
896
897         return err;
898 }
899
900 /*
901  *      IPv4 request_sock destructor.
902  */
903 static void tcp_v4_reqsk_destructor(struct request_sock *req)
904 {
905         kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
906 }
907
908 #ifdef CONFIG_TCP_MD5SIG
909 /*
910  * RFC2385 MD5 checksumming requires a mapping of
911  * IP address->MD5 Key.
912  * We need to maintain these in the sk structure.
913  */
914
915 /* Find the Key structure for an address.  */
916 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
917                                          const union tcp_md5_addr *addr,
918                                          int family)
919 {
920         const struct tcp_sock *tp = tcp_sk(sk);
921         struct tcp_md5sig_key *key;
922         const struct tcp_md5sig_info *md5sig;
923         __be32 mask;
924         struct tcp_md5sig_key *best_match = NULL;
925         bool match;
926
927         /* caller either holds rcu_read_lock() or socket lock */
928         md5sig = rcu_dereference_check(tp->md5sig_info,
929                                        lockdep_sock_is_held(sk));
930         if (!md5sig)
931                 return NULL;
932
933         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
934                 if (key->family != family)
935                         continue;
936
937                 if (family == AF_INET) {
938                         mask = inet_make_mask(key->prefixlen);
939                         match = (key->addr.a4.s_addr & mask) ==
940                                 (addr->a4.s_addr & mask);
941 #if IS_ENABLED(CONFIG_IPV6)
942                 } else if (family == AF_INET6) {
943                         match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
944                                                   key->prefixlen);
945 #endif
946                 } else {
947                         match = false;
948                 }
949
950                 if (match && (!best_match ||
951                               key->prefixlen > best_match->prefixlen))
952                         best_match = key;
953         }
954         return best_match;
955 }
956 EXPORT_SYMBOL(tcp_md5_do_lookup);
957
958 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
959                                                       const union tcp_md5_addr *addr,
960                                                       int family, u8 prefixlen)
961 {
962         const struct tcp_sock *tp = tcp_sk(sk);
963         struct tcp_md5sig_key *key;
964         unsigned int size = sizeof(struct in_addr);
965         const struct tcp_md5sig_info *md5sig;
966
967         /* caller either holds rcu_read_lock() or socket lock */
968         md5sig = rcu_dereference_check(tp->md5sig_info,
969                                        lockdep_sock_is_held(sk));
970         if (!md5sig)
971                 return NULL;
972 #if IS_ENABLED(CONFIG_IPV6)
973         if (family == AF_INET6)
974                 size = sizeof(struct in6_addr);
975 #endif
976         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
977                 if (key->family != family)
978                         continue;
979                 if (!memcmp(&key->addr, addr, size) &&
980                     key->prefixlen == prefixlen)
981                         return key;
982         }
983         return NULL;
984 }
985
986 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
987                                          const struct sock *addr_sk)
988 {
989         const union tcp_md5_addr *addr;
990
991         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
992         return tcp_md5_do_lookup(sk, addr, AF_INET);
993 }
994 EXPORT_SYMBOL(tcp_v4_md5_lookup);
995
996 /* This can be called on a newly created socket, from other files */
997 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
998                    int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
999                    gfp_t gfp)
1000 {
1001         /* Add Key to the list */
1002         struct tcp_md5sig_key *key;
1003         struct tcp_sock *tp = tcp_sk(sk);
1004         struct tcp_md5sig_info *md5sig;
1005
1006         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1007         if (key) {
1008                 /* Pre-existing entry - just update that one. */
1009                 memcpy(key->key, newkey, newkeylen);
1010                 key->keylen = newkeylen;
1011                 return 0;
1012         }
1013
1014         md5sig = rcu_dereference_protected(tp->md5sig_info,
1015                                            lockdep_sock_is_held(sk));
1016         if (!md5sig) {
1017                 md5sig = kmalloc(sizeof(*md5sig), gfp);
1018                 if (!md5sig)
1019                         return -ENOMEM;
1020
1021                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1022                 INIT_HLIST_HEAD(&md5sig->head);
1023                 rcu_assign_pointer(tp->md5sig_info, md5sig);
1024         }
1025
1026         key = sock_kmalloc(sk, sizeof(*key), gfp);
1027         if (!key)
1028                 return -ENOMEM;
1029         if (!tcp_alloc_md5sig_pool()) {
1030                 sock_kfree_s(sk, key, sizeof(*key));
1031                 return -ENOMEM;
1032         }
1033
1034         memcpy(key->key, newkey, newkeylen);
1035         key->keylen = newkeylen;
1036         key->family = family;
1037         key->prefixlen = prefixlen;
1038         memcpy(&key->addr, addr,
1039                (family == AF_INET6) ? sizeof(struct in6_addr) :
1040                                       sizeof(struct in_addr));
1041         hlist_add_head_rcu(&key->node, &md5sig->head);
1042         return 0;
1043 }
1044 EXPORT_SYMBOL(tcp_md5_do_add);
1045
1046 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1047                    u8 prefixlen)
1048 {
1049         struct tcp_md5sig_key *key;
1050
1051         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1052         if (!key)
1053                 return -ENOENT;
1054         hlist_del_rcu(&key->node);
1055         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1056         kfree_rcu(key, rcu);
1057         return 0;
1058 }
1059 EXPORT_SYMBOL(tcp_md5_do_del);
1060
1061 static void tcp_clear_md5_list(struct sock *sk)
1062 {
1063         struct tcp_sock *tp = tcp_sk(sk);
1064         struct tcp_md5sig_key *key;
1065         struct hlist_node *n;
1066         struct tcp_md5sig_info *md5sig;
1067
1068         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1069
1070         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1071                 hlist_del_rcu(&key->node);
1072                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1073                 kfree_rcu(key, rcu);
1074         }
1075 }
1076
1077 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1078                                  char __user *optval, int optlen)
1079 {
1080         struct tcp_md5sig cmd;
1081         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1082         u8 prefixlen = 32;
1083
1084         if (optlen < sizeof(cmd))
1085                 return -EINVAL;
1086
1087         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1088                 return -EFAULT;
1089
1090         if (sin->sin_family != AF_INET)
1091                 return -EINVAL;
1092
1093         if (optname == TCP_MD5SIG_EXT &&
1094             cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1095                 prefixlen = cmd.tcpm_prefixlen;
1096                 if (prefixlen > 32)
1097                         return -EINVAL;
1098         }
1099
1100         if (!cmd.tcpm_keylen)
1101                 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1102                                       AF_INET, prefixlen);
1103
1104         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1105                 return -EINVAL;
1106
1107         return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1108                               AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
1109                               GFP_KERNEL);
1110 }
1111
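/*
 * User space reaches tcp_v4_parse_md5_keys() via setsockopt().  A minimal
 * sketch of installing a key for one peer (illustrative only, error
 * handling omitted):
 *
 *	struct tcp_md5sig md5;
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	memset(&md5, 0, sizeof(md5));
 *	peer->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &peer->sin_addr);
 *	md5.tcpm_keylen = strlen("secret");
 *	memcpy(md5.tcpm_key, "secret", md5.tcpm_keylen);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */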
1112 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1113                                    __be32 daddr, __be32 saddr,
1114                                    const struct tcphdr *th, int nbytes)
1115 {
1116         struct tcp4_pseudohdr *bp;
1117         struct scatterlist sg;
1118         struct tcphdr *_th;
1119
1120         bp = hp->scratch;
1121         bp->saddr = saddr;
1122         bp->daddr = daddr;
1123         bp->pad = 0;
1124         bp->protocol = IPPROTO_TCP;
1125         bp->len = cpu_to_be16(nbytes);
1126
1127         _th = (struct tcphdr *)(bp + 1);
1128         memcpy(_th, th, sizeof(*th));
1129         _th->check = 0;
1130
1131         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1132         ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1133                                 sizeof(*bp) + sizeof(*th));
1134         return crypto_ahash_update(hp->md5_req);
1135 }
1136
1137 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1138                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1139 {
1140         struct tcp_md5sig_pool *hp;
1141         struct ahash_request *req;
1142
1143         hp = tcp_get_md5sig_pool();
1144         if (!hp)
1145                 goto clear_hash_noput;
1146         req = hp->md5_req;
1147
1148         if (crypto_ahash_init(req))
1149                 goto clear_hash;
1150         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1151                 goto clear_hash;
1152         if (tcp_md5_hash_key(hp, key))
1153                 goto clear_hash;
1154         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1155         if (crypto_ahash_final(req))
1156                 goto clear_hash;
1157
1158         tcp_put_md5sig_pool();
1159         return 0;
1160
1161 clear_hash:
1162         tcp_put_md5sig_pool();
1163 clear_hash_noput:
1164         memset(md5_hash, 0, 16);
1165         return 1;
1166 }
1167
1168 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1169                         const struct sock *sk,
1170                         const struct sk_buff *skb)
1171 {
1172         struct tcp_md5sig_pool *hp;
1173         struct ahash_request *req;
1174         const struct tcphdr *th = tcp_hdr(skb);
1175         __be32 saddr, daddr;
1176
1177         if (sk) { /* valid for establish/request sockets */
1178                 saddr = sk->sk_rcv_saddr;
1179                 daddr = sk->sk_daddr;
1180         } else {
1181                 const struct iphdr *iph = ip_hdr(skb);
1182                 saddr = iph->saddr;
1183                 daddr = iph->daddr;
1184         }
1185
1186         hp = tcp_get_md5sig_pool();
1187         if (!hp)
1188                 goto clear_hash_noput;
1189         req = hp->md5_req;
1190
1191         if (crypto_ahash_init(req))
1192                 goto clear_hash;
1193
1194         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1195                 goto clear_hash;
1196         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1197                 goto clear_hash;
1198         if (tcp_md5_hash_key(hp, key))
1199                 goto clear_hash;
1200         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1201         if (crypto_ahash_final(req))
1202                 goto clear_hash;
1203
1204         tcp_put_md5sig_pool();
1205         return 0;
1206
1207 clear_hash:
1208         tcp_put_md5sig_pool();
1209 clear_hash_noput:
1210         memset(md5_hash, 0, 16);
1211         return 1;
1212 }
1213 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1214
1215 #endif
1216
1217 /* Called with rcu_read_lock() */
1218 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1219                                     const struct sk_buff *skb)
1220 {
1221 #ifdef CONFIG_TCP_MD5SIG
1222         /*
1223          * This gets called for each TCP segment that arrives
1224          * so we want to be efficient.
1225          * We have 3 drop cases:
1226          * o No MD5 hash and one expected.
1227          * o MD5 hash and we're not expecting one.
1228          * o MD5 hash and it's wrong.
1229          */
1230         const __u8 *hash_location = NULL;
1231         struct tcp_md5sig_key *hash_expected;
1232         const struct iphdr *iph = ip_hdr(skb);
1233         const struct tcphdr *th = tcp_hdr(skb);
1234         int genhash;
1235         unsigned char newhash[16];
1236
1237         hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1238                                           AF_INET);
1239         hash_location = tcp_parse_md5sig_option(th);
1240
1241         /* We've parsed the options - do we have a hash? */
1242         if (!hash_expected && !hash_location)
1243                 return false;
1244
1245         if (hash_expected && !hash_location) {
1246                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1247                 return true;
1248         }
1249
1250         if (!hash_expected && hash_location) {
1251                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1252                 return true;
1253         }
1254
1255         /* Okay, so this is hash_expected and hash_location -
1256          * so we need to calculate the checksum.
1257          */
1258         genhash = tcp_v4_md5_hash_skb(newhash,
1259                                       hash_expected,
1260                                       NULL, skb);
1261
1262         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1263                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1264                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1265                                      &iph->saddr, ntohs(th->source),
1266                                      &iph->daddr, ntohs(th->dest),
1267                                      genhash ? " tcp_v4_calc_md5_hash failed"
1268                                      : "");
1269                 return true;
1270         }
1271         return false;
1272 #endif
1273         return false;
1274 }
1275
1276 static void tcp_v4_init_req(struct request_sock *req,
1277                             const struct sock *sk_listener,
1278                             struct sk_buff *skb)
1279 {
1280         struct inet_request_sock *ireq = inet_rsk(req);
1281         struct net *net = sock_net(sk_listener);
1282
1283         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1284         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1285         RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1286 }
1287
1288 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1289                                           struct flowi *fl,
1290                                           const struct request_sock *req)
1291 {
1292         return inet_csk_route_req(sk, &fl->u.ip4, req);
1293 }
1294
1295 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1296         .family         =       PF_INET,
1297         .obj_size       =       sizeof(struct tcp_request_sock),
1298         .rtx_syn_ack    =       tcp_rtx_synack,
1299         .send_ack       =       tcp_v4_reqsk_send_ack,
1300         .destructor     =       tcp_v4_reqsk_destructor,
1301         .send_reset     =       tcp_v4_send_reset,
1302         .syn_ack_timeout =      tcp_syn_ack_timeout,
1303 };
1304
1305 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1306         .mss_clamp      =       TCP_MSS_DEFAULT,
1307 #ifdef CONFIG_TCP_MD5SIG
1308         .req_md5_lookup =       tcp_v4_md5_lookup,
1309         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1310 #endif
1311         .init_req       =       tcp_v4_init_req,
1312 #ifdef CONFIG_SYN_COOKIES
1313         .cookie_init_seq =      cookie_v4_init_sequence,
1314 #endif
1315         .route_req      =       tcp_v4_route_req,
1316         .init_seq       =       tcp_v4_init_seq,
1317         .init_ts_off    =       tcp_v4_init_ts_off,
1318         .send_synack    =       tcp_v4_send_synack,
1319 };
1320
1321 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1322 {
1323         /* Never answer SYNs sent to broadcast or multicast addresses */
1324         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1325                 goto drop;
1326
1327         return tcp_conn_request(&tcp_request_sock_ops,
1328                                 &tcp_request_sock_ipv4_ops, sk, skb);
1329
1330 drop:
1331         tcp_listendrop(sk);
1332         return 0;
1333 }
1334 EXPORT_SYMBOL(tcp_v4_conn_request);
1335
1336
1337 /*
1338  * The three-way handshake has completed - we received the valid final ACK -
1339  * now create the new socket.
1340  */
1341 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1342                                   struct request_sock *req,
1343                                   struct dst_entry *dst,
1344                                   struct request_sock *req_unhash,
1345                                   bool *own_req)
1346 {
1347         struct inet_request_sock *ireq;
1348         struct inet_sock *newinet;
1349         struct tcp_sock *newtp;
1350         struct sock *newsk;
1351 #ifdef CONFIG_TCP_MD5SIG
1352         struct tcp_md5sig_key *key;
1353 #endif
1354         struct ip_options_rcu *inet_opt;
1355
1356         if (sk_acceptq_is_full(sk))
1357                 goto exit_overflow;
1358
1359         newsk = tcp_create_openreq_child(sk, req, skb);
1360         if (!newsk)
1361                 goto exit_nonewsk;
1362
1363         newsk->sk_gso_type = SKB_GSO_TCPV4;
1364         inet_sk_rx_dst_set(newsk, skb);
1365
1366         newtp                 = tcp_sk(newsk);
1367         newinet               = inet_sk(newsk);
1368         ireq                  = inet_rsk(req);
1369         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1370         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1371         newsk->sk_bound_dev_if = ireq->ir_iif;
1372         newinet->inet_saddr   = ireq->ir_loc_addr;
1373         inet_opt              = rcu_dereference(ireq->ireq_opt);
1374         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1375         newinet->mc_index     = inet_iif(skb);
1376         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1377         newinet->rcv_tos      = ip_hdr(skb)->tos;
1378         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1379         if (inet_opt)
1380                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1381         newinet->inet_id = newtp->write_seq ^ jiffies;
1382
1383         if (!dst) {
1384                 dst = inet_csk_route_child_sock(sk, newsk, req);
1385                 if (!dst)
1386                         goto put_and_exit;
1387         } else {
1388                 /* syncookie case : see end of cookie_v4_check() */
1389         }
1390         sk_setup_caps(newsk, dst);
1391
1392         tcp_ca_openreq_child(newsk, dst);
1393
1394         tcp_sync_mss(newsk, dst_mtu(dst));
1395         newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1396
1397         tcp_initialize_rcv_mss(newsk);
1398
1399 #ifdef CONFIG_TCP_MD5SIG
1400         /* Copy over the MD5 key from the original socket */
1401         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1402                                 AF_INET);
1403         if (key) {
1404                 /*
1405                  * We're using one, so create a matching key
1406                  * on the newsk structure. If we fail to get
1407                  * memory, then we end up not copying the key
1408                  * across. Shucks.
1409                  */
1410                 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1411                                AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1412                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1413         }
1414 #endif
1415
1416         if (__inet_inherit_port(sk, newsk) < 0)
1417                 goto put_and_exit;
1418         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1419         if (likely(*own_req)) {
1420                 tcp_move_syn(newtp, req);
1421                 ireq->ireq_opt = NULL;
1422         } else {
1423                 newinet->inet_opt = NULL;
1424         }
1425         return newsk;
1426
1427 exit_overflow:
1428         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1429 exit_nonewsk:
1430         dst_release(dst);
1431 exit:
1432         tcp_listendrop(sk);
1433         return NULL;
1434 put_and_exit:
1435         newinet->inet_opt = NULL;
1436         inet_csk_prepare_forced_close(newsk);
1437         tcp_done(newsk);
1438         goto exit;
1439 }
1440 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1441
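/* With CONFIG_SYN_COOKIES, a non-SYN segment that reaches a listener may
 * carry a valid syncookie in its ACK; try to recover a connection from it.
 * Without syncookie support this is a no-op and the listener is returned
 * unchanged.
 */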
1442 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1443 {
1444 #ifdef CONFIG_SYN_COOKIES
1445         const struct tcphdr *th = tcp_hdr(skb);
1446
1447         if (!th->syn)
1448                 sk = cookie_v4_check(sk, skb);
1449 #endif
1450         return sk;
1451 }
1452
1453 /* The socket must have its spinlock held when we get
1454  * here, unless it is a TCP_LISTEN socket.
1455  *
1456  * We have a potential double-lock case here, so even when
1457  * doing backlog processing we use the BH locking scheme.
1458  * This is because we cannot sleep with the original spinlock
1459  * held.
1460  */
1461 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1462 {
1463         struct sock *rsk;
1464
1465         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1466                 struct dst_entry *dst = sk->sk_rx_dst;
1467
1468                 sock_rps_save_rxhash(sk, skb);
1469                 sk_mark_napi_id(sk, skb);
1470                 if (dst) {
1471                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1472                             !dst->ops->check(dst, 0)) {
1473                                 dst_release(dst);
1474                                 sk->sk_rx_dst = NULL;
1475                         }
1476                 }
1477                 tcp_rcv_established(sk, skb, tcp_hdr(skb));
1478                 return 0;
1479         }
1480
1481         if (tcp_checksum_complete(skb))
1482                 goto csum_err;
1483
1484         if (sk->sk_state == TCP_LISTEN) {
1485                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1486
1487                 if (!nsk)
1488                         goto discard;
1489                 if (nsk != sk) {
1490                         if (tcp_child_process(sk, nsk, skb)) {
1491                                 rsk = nsk;
1492                                 goto reset;
1493                         }
1494                         return 0;
1495                 }
1496         } else
1497                 sock_rps_save_rxhash(sk, skb);
1498
1499         if (tcp_rcv_state_process(sk, skb)) {
1500                 rsk = sk;
1501                 goto reset;
1502         }
1503         return 0;
1504
1505 reset:
1506         tcp_v4_send_reset(rsk, skb);
1507 discard:
1508         kfree_skb(skb);
1509         /* Be careful here. If this function gets more complicated and
1510          * gcc suffers from register pressure on the x86, sk (in %ebx)
1511          * might be destroyed here. This current version compiles correctly,
1512          * but you have been warned.
1513          */
1514         return 0;
1515
1516 csum_err:
1517         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1518         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1519         goto discard;
1520 }
1521 EXPORT_SYMBOL(tcp_v4_do_rcv);
1522
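/* Early demux: look up an established socket for the incoming segment before
 * the routing decision and, when one is found, attach it (and its cached rx
 * dst, if still valid for this interface) to the skb so the later receive
 * path can skip the full lookup.
 */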
1523 int tcp_v4_early_demux(struct sk_buff *skb)
1524 {
1525         const struct iphdr *iph;
1526         const struct tcphdr *th;
1527         struct sock *sk;
1528
1529         if (skb->pkt_type != PACKET_HOST)
1530                 return 0;
1531
1532         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1533                 return 0;
1534
1535         iph = ip_hdr(skb);
1536         th = tcp_hdr(skb);
1537
1538         if (th->doff < sizeof(struct tcphdr) / 4)
1539                 return 0;
1540
1541         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1542                                        iph->saddr, th->source,
1543                                        iph->daddr, ntohs(th->dest),
1544                                        skb->skb_iif, inet_sdif(skb));
1545         if (sk) {
1546                 skb->sk = sk;
1547                 skb->destructor = sock_edemux;
1548                 if (sk_fullsock(sk)) {
1549                         struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1550
1551                         if (dst)
1552                                 dst = dst_check(dst, 0);
1553                         if (dst &&
1554                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1555                                 skb_dst_set_noref(skb, dst);
1556                 }
1557         }
1558         return 0;
1559 }
1560
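/* Queue a segment on the backlog of a socket that is currently owned by user
 * context.  Returns true if the backlog limit was exceeded; in that case the
 * socket has already been unlocked and the caller must drop the skb.
 */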
1561 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1562 {
1563         u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1564
1565         /* Only the socket owner can try to collapse/prune rx queues
1566          * to reduce memory overhead, so add a little headroom here.
1567          * Only a few socket backlogs are likely to be non-empty at the same time.
1568          */
1569         limit += 64*1024;
1570
1571         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1572          * we can fix skb->truesize to its real value to avoid future drops.
1573          * This is valid because skb is not yet charged to the socket.
1574          * It has been observed that pure SACK packets were sometimes dropped
1575          * (when produced by drivers without the copybreak feature).
1576          */
1577         skb_condense(skb);
1578
1579         if (unlikely(sk_add_backlog(sk, skb, limit))) {
1580                 bh_unlock_sock(sk);
1581                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1582                 return true;
1583         }
1584         return false;
1585 }
1586 EXPORT_SYMBOL(tcp_add_backlog);
1587
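/* Run the socket filter on an incoming segment.  The filter may trim the
 * payload (but never below the TCP header), so end_seq in the control block
 * is reduced by the number of bytes removed.  Returns non-zero if the
 * segment should be dropped.
 */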
1588 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1589 {
1590         struct tcphdr *th = (struct tcphdr *)skb->data;
1591         unsigned int eaten = skb->len;
1592         int err;
1593
1594         err = sk_filter_trim_cap(sk, skb, th->doff * 4);
1595         if (!err) {
1596                 eaten -= skb->len;
1597                 TCP_SKB_CB(skb)->end_seq -= eaten;
1598         }
1599         return err;
1600 }
1601 EXPORT_SYMBOL(tcp_filter);
1602
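/* Undo tcp_v4_fill_cb(): move the saved IP control block back to its usual
 * location before the skb is handed to a different socket or looked up again.
 */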
1603 static void tcp_v4_restore_cb(struct sk_buff *skb)
1604 {
1605         memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1606                 sizeof(struct inet_skb_parm));
1607 }
1608
1609 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1610                            const struct tcphdr *th)
1611 {
1612         /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1613          * barrier() makes sure the compiler won't play fool^Waliasing games.
1614          */
1615         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1616                 sizeof(struct inet_skb_parm));
1617         barrier();
1618
1619         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1620         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1621                                     skb->len - th->doff * 4);
1622         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1623         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1624         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1625         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1626         TCP_SKB_CB(skb)->sacked  = 0;
1627         TCP_SKB_CB(skb)->has_rxtstamp =
1628                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1629 }
1630
1631 /*
1632  *      From tcp_input.c
1633  */
1634
1635 int tcp_v4_rcv(struct sk_buff *skb)
1636 {
1637         struct net *net = dev_net(skb->dev);
1638         int sdif = inet_sdif(skb);
1639         const struct iphdr *iph;
1640         const struct tcphdr *th;
1641         bool refcounted;
1642         struct sock *sk;
1643         int ret;
1644
1645         if (skb->pkt_type != PACKET_HOST)
1646                 goto discard_it;
1647
1648         /* Count it even if it's bad */
1649         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1650
1651         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1652                 goto discard_it;
1653
1654         th = (const struct tcphdr *)skb->data;
1655
1656         if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1657                 goto bad_packet;
1658         if (!pskb_may_pull(skb, th->doff * 4))
1659                 goto discard_it;
1660
1661         /* An explanation is required here, I think.
1662          * Packet length and doff are validated by header prediction,
1663          * provided the case of th->doff == 0 is eliminated.
1664          * So, we defer the checks. */
1665
1666         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1667                 goto csum_error;
1668
1669         th = (const struct tcphdr *)skb->data;
1670         iph = ip_hdr(skb);
1671 lookup:
1672         sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1673                                th->dest, sdif, &refcounted);
1674         if (!sk)
1675                 goto no_tcp_socket;
1676
1677 process:
1678         if (sk->sk_state == TCP_TIME_WAIT)
1679                 goto do_time_wait;
1680
1681         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1682                 struct request_sock *req = inet_reqsk(sk);
1683                 bool req_stolen = false;
1684                 struct sock *nsk;
1685
1686                 sk = req->rsk_listener;
1687                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1688                         sk_drops_add(sk, skb);
1689                         reqsk_put(req);
1690                         goto discard_it;
1691                 }
1692                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1693                         inet_csk_reqsk_queue_drop_and_put(sk, req);
1694                         goto lookup;
1695                 }
1696                 /* We own a reference on the listener, increase it again
1697                  * as we might lose it too soon.
1698                  */
1699                 sock_hold(sk);
1700                 refcounted = true;
1701                 nsk = NULL;
1702                 if (!tcp_filter(sk, skb)) {
1703                         th = (const struct tcphdr *)skb->data;
1704                         iph = ip_hdr(skb);
1705                         tcp_v4_fill_cb(skb, iph, th);
1706                         nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1707                 }
1708                 if (!nsk) {
1709                         reqsk_put(req);
1710                         if (req_stolen) {
1711                                 /* Another cpu got exclusive access to req
1712                                  * and created a full blown socket.
1713                                  * Try to feed this packet to this socket
1714                                  * instead of discarding it.
1715                                  */
1716                                 tcp_v4_restore_cb(skb);
1717                                 sock_put(sk);
1718                                 goto lookup;
1719                         }
1720                         goto discard_and_relse;
1721                 }
1722                 if (nsk == sk) {
1723                         reqsk_put(req);
1724                         tcp_v4_restore_cb(skb);
1725                 } else if (tcp_child_process(sk, nsk, skb)) {
1726                         tcp_v4_send_reset(nsk, skb);
1727                         goto discard_and_relse;
1728                 } else {
1729                         sock_put(sk);
1730                         return 0;
1731                 }
1732         }
1733         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1734                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1735                 goto discard_and_relse;
1736         }
1737
1738         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1739                 goto discard_and_relse;
1740
1741         if (tcp_v4_inbound_md5_hash(sk, skb))
1742                 goto discard_and_relse;
1743
1744         nf_reset(skb);
1745
1746         if (tcp_filter(sk, skb))
1747                 goto discard_and_relse;
1748         th = (const struct tcphdr *)skb->data;
1749         iph = ip_hdr(skb);
1750         tcp_v4_fill_cb(skb, iph, th);
1751
1752         skb->dev = NULL;
1753
1754         if (sk->sk_state == TCP_LISTEN) {
1755                 ret = tcp_v4_do_rcv(sk, skb);
1756                 goto put_and_return;
1757         }
1758
1759         sk_incoming_cpu_update(sk);
1760
1761         bh_lock_sock_nested(sk);
1762         tcp_segs_in(tcp_sk(sk), skb);
1763         ret = 0;
1764         if (!sock_owned_by_user(sk)) {
1765                 ret = tcp_v4_do_rcv(sk, skb);
1766         } else if (tcp_add_backlog(sk, skb)) {
1767                 goto discard_and_relse;
1768         }
1769         bh_unlock_sock(sk);
1770
1771 put_and_return:
1772         if (refcounted)
1773                 sock_put(sk);
1774
1775         return ret;
1776
1777 no_tcp_socket:
1778         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1779                 goto discard_it;
1780
1781         tcp_v4_fill_cb(skb, iph, th);
1782
1783         if (tcp_checksum_complete(skb)) {
1784 csum_error:
1785                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1786 bad_packet:
1787                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
1788         } else {
1789                 tcp_v4_send_reset(NULL, skb);
1790         }
1791
1792 discard_it:
1793         /* Discard frame. */
1794         kfree_skb(skb);
1795         return 0;
1796
1797 discard_and_relse:
1798         sk_drops_add(sk, skb);
1799         if (refcounted)
1800                 sock_put(sk);
1801         goto discard_it;
1802
1803 do_time_wait:
1804         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1805                 inet_twsk_put(inet_twsk(sk));
1806                 goto discard_it;
1807         }
1808
1809         tcp_v4_fill_cb(skb, iph, th);
1810
1811         if (tcp_checksum_complete(skb)) {
1812                 inet_twsk_put(inet_twsk(sk));
1813                 goto csum_error;
1814         }
1815         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1816         case TCP_TW_SYN: {
1817                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1818                                                         &tcp_hashinfo, skb,
1819                                                         __tcp_hdrlen(th),
1820                                                         iph->saddr, th->source,
1821                                                         iph->daddr, th->dest,
1822                                                         inet_iif(skb),
1823                                                         sdif);
1824                 if (sk2) {
1825                         inet_twsk_deschedule_put(inet_twsk(sk));
1826                         sk = sk2;
1827                         tcp_v4_restore_cb(skb);
1828                         refcounted = false;
1829                         goto process;
1830                 }
1831         }
1832                 /* to ACK */
1833                 /* fall through */
1834         case TCP_TW_ACK:
1835                 tcp_v4_timewait_ack(sk, skb);
1836                 break;
1837         case TCP_TW_RST:
1838                 tcp_v4_send_reset(sk, skb);
1839                 inet_twsk_deschedule_put(inet_twsk(sk));
1840                 goto discard_it;
1841         case TCP_TW_SUCCESS:;
1842         }
1843         goto discard_it;
1844 }
1845
1846 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1847         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1848         .twsk_unique    = tcp_twsk_unique,
1849         .twsk_destructor= tcp_twsk_destructor,
1850 };
1851
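/* Cache the input route of a received skb on the socket, together with the
 * incoming interface index, so later segments can reuse it (see the early
 * demux and established fast paths above).
 */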
1852 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1853 {
1854         struct dst_entry *dst = skb_dst(skb);
1855
1856         if (dst && dst_hold_safe(dst)) {
1857                 sk->sk_rx_dst = dst;
1858                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1859         }
1860 }
1861 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1862
1863 const struct inet_connection_sock_af_ops ipv4_specific = {
1864         .queue_xmit        = ip_queue_xmit,
1865         .send_check        = tcp_v4_send_check,
1866         .rebuild_header    = inet_sk_rebuild_header,
1867         .sk_rx_dst_set     = inet_sk_rx_dst_set,
1868         .conn_request      = tcp_v4_conn_request,
1869         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1870         .net_header_len    = sizeof(struct iphdr),
1871         .setsockopt        = ip_setsockopt,
1872         .getsockopt        = ip_getsockopt,
1873         .addr2sockaddr     = inet_csk_addr2sockaddr,
1874         .sockaddr_len      = sizeof(struct sockaddr_in),
1875 #ifdef CONFIG_COMPAT
1876         .compat_setsockopt = compat_ip_setsockopt,
1877         .compat_getsockopt = compat_ip_getsockopt,
1878 #endif
1879         .mtu_reduced       = tcp_v4_mtu_reduced,
1880 };
1881 EXPORT_SYMBOL(ipv4_specific);
1882
1883 #ifdef CONFIG_TCP_MD5SIG
1884 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1885         .md5_lookup             = tcp_v4_md5_lookup,
1886         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1887         .md5_parse              = tcp_v4_parse_md5_keys,
1888 };
1889 #endif
1890
1891 /* NOTE: A lot of fields are set to zero explicitly by the call to
1892  *       sk_alloc(), so they need not be initialized here.
1893  */
1894 static int tcp_v4_init_sock(struct sock *sk)
1895 {
1896         struct inet_connection_sock *icsk = inet_csk(sk);
1897
1898         tcp_init_sock(sk);
1899
1900         icsk->icsk_af_ops = &ipv4_specific;
1901
1902 #ifdef CONFIG_TCP_MD5SIG
1903         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1904 #endif
1905
1906         return 0;
1907 }
1908
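/* Final per-socket cleanup: stop timers, release congestion control and ULP
 * state, purge the write and out-of-order queues, free MD5 keys and Fast Open
 * state, and drop the bind bucket reference.
 */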
1909 void tcp_v4_destroy_sock(struct sock *sk)
1910 {
1911         struct tcp_sock *tp = tcp_sk(sk);
1912
1913         trace_tcp_destroy_sock(sk);
1914
1915         tcp_clear_xmit_timers(sk);
1916
1917         tcp_cleanup_congestion_control(sk);
1918
1919         tcp_cleanup_ulp(sk);
1920
1921         /* Clean up the write buffer. */
1922         tcp_write_queue_purge(sk);
1923
1924         /* Check if we want to disable active TFO */
1925         tcp_fastopen_active_disable_ofo_check(sk);
1926
1927         /* Cleans up our, hopefully empty, out_of_order_queue. */
1928         skb_rbtree_purge(&tp->out_of_order_queue);
1929
1930 #ifdef CONFIG_TCP_MD5SIG
1931         /* Clean up the MD5 key list, if any */
1932         if (tp->md5sig_info) {
1933                 tcp_clear_md5_list(sk);
1934                 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
1935                 tp->md5sig_info = NULL;
1936         }
1937 #endif
1938
1939         /* Clean up a referenced TCP bind bucket. */
1940         if (inet_csk(sk)->icsk_bind_hash)
1941                 inet_put_port(sk);
1942
1943         BUG_ON(tp->fastopen_rsk);
1944
1945         /* If socket is aborted during connect operation */
1946         tcp_free_fastopen_req(tp);
1947         tcp_fastopen_destroy_cipher(sk);
1948         tcp_saved_syn_free(tp);
1949
1950         sk_sockets_allocated_dec(sk);
1951 }
1952 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1953
1954 #ifdef CONFIG_PROC_FS
1955 /* Proc filesystem TCP sock list dumping. */
1956
1957 /*
1958  * Get the next listener socket following cur.  If cur is NULL, get the first
1959  * socket starting from the bucket given in st->bucket; when st->bucket is zero the
1960  * very first socket in the hash table is returned.
1961  */
1962 static void *listening_get_next(struct seq_file *seq, void *cur)
1963 {
1964         struct tcp_iter_state *st = seq->private;
1965         struct net *net = seq_file_net(seq);
1966         struct inet_listen_hashbucket *ilb;
1967         struct sock *sk = cur;
1968
1969         if (!sk) {
1970 get_head:
1971                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1972                 spin_lock(&ilb->lock);
1973                 sk = sk_head(&ilb->head);
1974                 st->offset = 0;
1975                 goto get_sk;
1976         }
1977         ilb = &tcp_hashinfo.listening_hash[st->bucket];
1978         ++st->num;
1979         ++st->offset;
1980
1981         sk = sk_next(sk);
1982 get_sk:
1983         sk_for_each_from(sk) {
1984                 if (!net_eq(sock_net(sk), net))
1985                         continue;
1986                 if (sk->sk_family == st->family)
1987                         return sk;
1988         }
1989         spin_unlock(&ilb->lock);
1990         st->offset = 0;
1991         if (++st->bucket < INET_LHTABLE_SIZE)
1992                 goto get_head;
1993         return NULL;
1994 }
1995
1996 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1997 {
1998         struct tcp_iter_state *st = seq->private;
1999         void *rc;
2000
2001         st->bucket = 0;
2002         st->offset = 0;
2003         rc = listening_get_next(seq, NULL);
2004
2005         while (rc && *pos) {
2006                 rc = listening_get_next(seq, rc);
2007                 --*pos;
2008         }
2009         return rc;
2010 }
2011
2012 static inline bool empty_bucket(const struct tcp_iter_state *st)
2013 {
2014         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2015 }
2016
2017 /*
2018  * Get the first established socket starting from the bucket given in st->bucket.
2019  * If st->bucket is zero, the very first socket in the hash is returned.
2020  */
2021 static void *established_get_first(struct seq_file *seq)
2022 {
2023         struct tcp_iter_state *st = seq->private;
2024         struct net *net = seq_file_net(seq);
2025         void *rc = NULL;
2026
2027         st->offset = 0;
2028         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2029                 struct sock *sk;
2030                 struct hlist_nulls_node *node;
2031                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2032
2033                 /* Lockless fast path for the common case of empty buckets */
2034                 if (empty_bucket(st))
2035                         continue;
2036
2037                 spin_lock_bh(lock);
2038                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2039                         if (sk->sk_family != st->family ||
2040                             !net_eq(sock_net(sk), net)) {
2041                                 continue;
2042                         }
2043                         rc = sk;
2044                         goto out;
2045                 }
2046                 spin_unlock_bh(lock);
2047         }
2048 out:
2049         return rc;
2050 }
2051
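/* Advance to the next established socket, moving on to the next non-empty
 * ehash bucket (and releasing the previous bucket's lock) once the current
 * chain is exhausted.
 */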
2052 static void *established_get_next(struct seq_file *seq, void *cur)
2053 {
2054         struct sock *sk = cur;
2055         struct hlist_nulls_node *node;
2056         struct tcp_iter_state *st = seq->private;
2057         struct net *net = seq_file_net(seq);
2058
2059         ++st->num;
2060         ++st->offset;
2061
2062         sk = sk_nulls_next(sk);
2063
2064         sk_nulls_for_each_from(sk, node) {
2065                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2066                         return sk;
2067         }
2068
2069         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2070         ++st->bucket;
2071         return established_get_first(seq);
2072 }
2073
2074 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2075 {
2076         struct tcp_iter_state *st = seq->private;
2077         void *rc;
2078
2079         st->bucket = 0;
2080         rc = established_get_first(seq);
2081
2082         while (rc && pos) {
2083                 rc = established_get_next(seq, rc);
2084                 --pos;
2085         }
2086         return rc;
2087 }
2088
2089 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2090 {
2091         void *rc;
2092         struct tcp_iter_state *st = seq->private;
2093
2094         st->state = TCP_SEQ_STATE_LISTENING;
2095         rc        = listening_get_idx(seq, &pos);
2096
2097         if (!rc) {
2098                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2099                 rc        = established_get_idx(seq, pos);
2100         }
2101
2102         return rc;
2103 }
2104
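/* Try to resume iteration at the bucket/offset recorded in the iterator
 * state instead of rescanning from the beginning; st->num is preserved
 * across the walk.
 */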
2105 static void *tcp_seek_last_pos(struct seq_file *seq)
2106 {
2107         struct tcp_iter_state *st = seq->private;
2108         int offset = st->offset;
2109         int orig_num = st->num;
2110         void *rc = NULL;
2111
2112         switch (st->state) {
2113         case TCP_SEQ_STATE_LISTENING:
2114                 if (st->bucket >= INET_LHTABLE_SIZE)
2115                         break;
2116                 st->state = TCP_SEQ_STATE_LISTENING;
2117                 rc = listening_get_next(seq, NULL);
2118                 while (offset-- && rc)
2119                         rc = listening_get_next(seq, rc);
2120                 if (rc)
2121                         break;
2122                 st->bucket = 0;
2123                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2124                 /* Fallthrough */
2125         case TCP_SEQ_STATE_ESTABLISHED:
2126                 if (st->bucket > tcp_hashinfo.ehash_mask)
2127                         break;
2128                 rc = established_get_first(seq);
2129                 while (offset-- && rc)
2130                         rc = established_get_next(seq, rc);
2131         }
2132
2133         st->num = orig_num;
2134
2135         return rc;
2136 }
2137
2138 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2139 {
2140         struct tcp_iter_state *st = seq->private;
2141         void *rc;
2142
2143         if (*pos && *pos == st->last_pos) {
2144                 rc = tcp_seek_last_pos(seq);
2145                 if (rc)
2146                         goto out;
2147         }
2148
2149         st->state = TCP_SEQ_STATE_LISTENING;
2150         st->num = 0;
2151         st->bucket = 0;
2152         st->offset = 0;
2153         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2154
2155 out:
2156         st->last_pos = *pos;
2157         return rc;
2158 }
2159
2160 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2161 {
2162         struct tcp_iter_state *st = seq->private;
2163         void *rc = NULL;
2164
2165         if (v == SEQ_START_TOKEN) {
2166                 rc = tcp_get_idx(seq, 0);
2167                 goto out;
2168         }
2169
2170         switch (st->state) {
2171         case TCP_SEQ_STATE_LISTENING:
2172                 rc = listening_get_next(seq, v);
2173                 if (!rc) {
2174                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2175                         st->bucket = 0;
2176                         st->offset = 0;
2177                         rc        = established_get_first(seq);
2178                 }
2179                 break;
2180         case TCP_SEQ_STATE_ESTABLISHED:
2181                 rc = established_get_next(seq, v);
2182                 break;
2183         }
2184 out:
2185         ++*pos;
2186         st->last_pos = *pos;
2187         return rc;
2188 }
2189
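/* Release whichever hash bucket lock is still held when the seq_file
 * iteration stops.
 */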
2190 static void tcp_seq_stop(struct seq_file *seq, void *v)
2191 {
2192         struct tcp_iter_state *st = seq->private;
2193
2194         switch (st->state) {
2195         case TCP_SEQ_STATE_LISTENING:
2196                 if (v != SEQ_START_TOKEN)
2197                         spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2198                 break;
2199         case TCP_SEQ_STATE_ESTABLISHED:
2200                 if (v)
2201                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2202                 break;
2203         }
2204 }
2205
2206 int tcp_seq_open(struct inode *inode, struct file *file)
2207 {
2208         struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2209         struct tcp_iter_state *s;
2210         int err;
2211
2212         err = seq_open_net(inode, file, &afinfo->seq_ops,
2213                           sizeof(struct tcp_iter_state));
2214         if (err < 0)
2215                 return err;
2216
2217         s = ((struct seq_file *)file->private_data)->private;
2218         s->family               = afinfo->family;
2219         s->last_pos             = 0;
2220         return 0;
2221 }
2222 EXPORT_SYMBOL(tcp_seq_open);
2223
2224 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2225 {
2226         int rc = 0;
2227         struct proc_dir_entry *p;
2228
2229         afinfo->seq_ops.start           = tcp_seq_start;
2230         afinfo->seq_ops.next            = tcp_seq_next;
2231         afinfo->seq_ops.stop            = tcp_seq_stop;
2232
2233         p = proc_create_data(afinfo->name, 0444, net->proc_net,
2234                              afinfo->seq_fops, afinfo);
2235         if (!p)
2236                 rc = -ENOMEM;
2237         return rc;
2238 }
2239 EXPORT_SYMBOL(tcp_proc_register);
2240
2241 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2242 {
2243         remove_proc_entry(afinfo->name, net->proc_net);
2244 }
2245 EXPORT_SYMBOL(tcp_proc_unregister);
2246
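/* Format one SYN_RECV request socket as a /proc/net/tcp line.  Fields that
 * have no meaning for a request sock (tx/rx queue sizes, inode, ...) are
 * printed as zero.
 */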
2247 static void get_openreq4(const struct request_sock *req,
2248                          struct seq_file *f, int i)
2249 {
2250         const struct inet_request_sock *ireq = inet_rsk(req);
2251         long delta = req->rsk_timer.expires - jiffies;
2252
2253         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2254                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2255                 i,
2256                 ireq->ir_loc_addr,
2257                 ireq->ir_num,
2258                 ireq->ir_rmt_addr,
2259                 ntohs(ireq->ir_rmt_port),
2260                 TCP_SYN_RECV,
2261                 0, 0, /* could print option size, but that is af dependent. */
2262                 1,    /* timers active (only the expire timer) */
2263                 jiffies_delta_to_clock_t(delta),
2264                 req->num_timeout,
2265                 from_kuid_munged(seq_user_ns(f),
2266                                  sock_i_uid(req->rsk_listener)),
2267                 0,  /* non standard timer */
2268                 0, /* open_requests have no inode */
2269                 0,
2270                 req);
2271 }
2272
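/* Format one full socket as a /proc/net/tcp line: addresses and ports, state,
 * queue sizes, the currently pending timer, retransmit and probe counters,
 * uid, inode, refcount and a few congestion control details.
 */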
2273 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2274 {
2275         int timer_active;
2276         unsigned long timer_expires;
2277         const struct tcp_sock *tp = tcp_sk(sk);
2278         const struct inet_connection_sock *icsk = inet_csk(sk);
2279         const struct inet_sock *inet = inet_sk(sk);
2280         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2281         __be32 dest = inet->inet_daddr;
2282         __be32 src = inet->inet_rcv_saddr;
2283         __u16 destp = ntohs(inet->inet_dport);
2284         __u16 srcp = ntohs(inet->inet_sport);
2285         int rx_queue;
2286         int state;
2287
2288         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2289             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2290             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2291                 timer_active    = 1;
2292                 timer_expires   = icsk->icsk_timeout;
2293         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2294                 timer_active    = 4;
2295                 timer_expires   = icsk->icsk_timeout;
2296         } else if (timer_pending(&sk->sk_timer)) {
2297                 timer_active    = 2;
2298                 timer_expires   = sk->sk_timer.expires;
2299         } else {
2300                 timer_active    = 0;
2301                 timer_expires = jiffies;
2302         }
2303
2304         state = inet_sk_state_load(sk);
2305         if (state == TCP_LISTEN)
2306                 rx_queue = sk->sk_ack_backlog;
2307         else
2308                 /* Because we don't lock the socket,
2309                  * we might find a transient negative value.
2310                  */
2311                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2312
2313         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2314                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2315                 i, src, srcp, dest, destp, state,
2316                 tp->write_seq - tp->snd_una,
2317                 rx_queue,
2318                 timer_active,
2319                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2320                 icsk->icsk_retransmits,
2321                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2322                 icsk->icsk_probes_out,
2323                 sock_i_ino(sk),
2324                 refcount_read(&sk->sk_refcnt), sk,
2325                 jiffies_to_clock_t(icsk->icsk_rto),
2326                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2327                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2328                 tp->snd_cwnd,
2329                 state == TCP_LISTEN ?
2330                     fastopenq->max_qlen :
2331                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2332 }
2333
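/* Format one TIME_WAIT socket as a /proc/net/tcp line; most counters do not
 * apply and are printed as zero.
 */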
2334 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2335                                struct seq_file *f, int i)
2336 {
2337         long delta = tw->tw_timer.expires - jiffies;
2338         __be32 dest, src;
2339         __u16 destp, srcp;
2340
2341         dest  = tw->tw_daddr;
2342         src   = tw->tw_rcv_saddr;
2343         destp = ntohs(tw->tw_dport);
2344         srcp  = ntohs(tw->tw_sport);
2345
2346         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2347                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2348                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2349                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2350                 refcount_read(&tw->tw_refcnt), tw);
2351 }
2352
2353 #define TMPSZ 150
2354
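/* seq_file ->show() callback: emit the header for the start token, otherwise
 * dispatch on socket state so TIME_WAIT and SYN_RECV entries are formatted by
 * their specialized helpers, padding every line to TMPSZ - 1 characters.
 */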
2355 static int tcp4_seq_show(struct seq_file *seq, void *v)
2356 {
2357         struct tcp_iter_state *st;
2358         struct sock *sk = v;
2359
2360         seq_setwidth(seq, TMPSZ - 1);
2361         if (v == SEQ_START_TOKEN) {
2362                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2363                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2364                            "inode");
2365                 goto out;
2366         }
2367         st = seq->private;
2368
2369         if (sk->sk_state == TCP_TIME_WAIT)
2370                 get_timewait4_sock(v, seq, st->num);
2371         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2372                 get_openreq4(v, seq, st->num);
2373         else
2374                 get_tcp4_sock(v, seq, st->num);
2375 out:
2376         seq_pad(seq, '\n');
2377         return 0;
2378 }
2379
2380 static const struct file_operations tcp_afinfo_seq_fops = {
2381         .open    = tcp_seq_open,
2382         .read    = seq_read,
2383         .llseek  = seq_lseek,
2384         .release = seq_release_net
2385 };
2386
2387 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2388         .name           = "tcp",
2389         .family         = AF_INET,
2390         .seq_fops       = &tcp_afinfo_seq_fops,
2391         .seq_ops        = {
2392                 .show           = tcp4_seq_show,
2393         },
2394 };
2395
2396 static int __net_init tcp4_proc_init_net(struct net *net)
2397 {
2398         return tcp_proc_register(net, &tcp4_seq_afinfo);
2399 }
2400
2401 static void __net_exit tcp4_proc_exit_net(struct net *net)
2402 {
2403         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2404 }
2405
2406 static struct pernet_operations tcp4_net_ops = {
2407         .init = tcp4_proc_init_net,
2408         .exit = tcp4_proc_exit_net,
2409 };
2410
2411 int __init tcp4_proc_init(void)
2412 {
2413         return register_pernet_subsys(&tcp4_net_ops);
2414 }
2415
2416 void tcp4_proc_exit(void)
2417 {
2418         unregister_pernet_subsys(&tcp4_net_ops);
2419 }
2420 #endif /* CONFIG_PROC_FS */
2421
2422 struct proto tcp_prot = {
2423         .name                   = "TCP",
2424         .owner                  = THIS_MODULE,
2425         .close                  = tcp_close,
2426         .pre_connect            = tcp_v4_pre_connect,
2427         .connect                = tcp_v4_connect,
2428         .disconnect             = tcp_disconnect,
2429         .accept                 = inet_csk_accept,
2430         .ioctl                  = tcp_ioctl,
2431         .init                   = tcp_v4_init_sock,
2432         .destroy                = tcp_v4_destroy_sock,
2433         .shutdown               = tcp_shutdown,
2434         .setsockopt             = tcp_setsockopt,
2435         .getsockopt             = tcp_getsockopt,
2436         .keepalive              = tcp_set_keepalive,
2437         .recvmsg                = tcp_recvmsg,
2438         .sendmsg                = tcp_sendmsg,
2439         .sendpage               = tcp_sendpage,
2440         .backlog_rcv            = tcp_v4_do_rcv,
2441         .release_cb             = tcp_release_cb,
2442         .hash                   = inet_hash,
2443         .unhash                 = inet_unhash,
2444         .get_port               = inet_csk_get_port,
2445         .enter_memory_pressure  = tcp_enter_memory_pressure,
2446         .leave_memory_pressure  = tcp_leave_memory_pressure,
2447         .stream_memory_free     = tcp_stream_memory_free,
2448         .sockets_allocated      = &tcp_sockets_allocated,
2449         .orphan_count           = &tcp_orphan_count,
2450         .memory_allocated       = &tcp_memory_allocated,
2451         .memory_pressure        = &tcp_memory_pressure,
2452         .sysctl_mem             = sysctl_tcp_mem,
2453         .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2454         .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2455         .max_header             = MAX_TCP_HEADER,
2456         .obj_size               = sizeof(struct tcp_sock),
2457         .slab_flags             = SLAB_TYPESAFE_BY_RCU,
2458         .twsk_prot              = &tcp_timewait_sock_ops,
2459         .rsk_prot               = &tcp_request_sock_ops,
2460         .h.hashinfo             = &tcp_hashinfo,
2461         .no_autobind            = true,
2462 #ifdef CONFIG_COMPAT
2463         .compat_setsockopt      = compat_tcp_setsockopt,
2464         .compat_getsockopt      = compat_tcp_getsockopt,
2465 #endif
2466         .diag_destroy           = tcp_abort,
2467 };
2468 EXPORT_SYMBOL(tcp_prot);
2469
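/* Per-netns teardown: drop the reference on the congestion control module and
 * destroy the per-CPU control sockets.
 */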
2470 static void __net_exit tcp_sk_exit(struct net *net)
2471 {
2472         int cpu;
2473
2474         module_put(net->ipv4.tcp_congestion_control->owner);
2475
2476         for_each_possible_cpu(cpu)
2477                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2478         free_percpu(net->ipv4.tcp_sk);
2479 }
2480
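/* Per-netns initialization: create one control socket per possible CPU (used
 * to send RSTs and ACKs when no local socket owns the packet) and set the
 * namespace's TCP sysctl defaults, inheriting the congestion control module
 * from the initial namespace when possible.
 */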
2481 static int __net_init tcp_sk_init(struct net *net)
2482 {
2483         int res, cpu, cnt;
2484
2485         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2486         if (!net->ipv4.tcp_sk)
2487                 return -ENOMEM;
2488
2489         for_each_possible_cpu(cpu) {
2490                 struct sock *sk;
2491
2492                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2493                                            IPPROTO_TCP, net);
2494                 if (res)
2495                         goto fail;
2496                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2497                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2498         }
2499
2500         net->ipv4.sysctl_tcp_ecn = 2;
2501         net->ipv4.sysctl_tcp_ecn_fallback = 1;
2502
2503         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2504         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2505         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2506
2507         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2508         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2509         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2510
2511         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2512         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2513         net->ipv4.sysctl_tcp_syncookies = 1;
2514         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2515         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2516         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2517         net->ipv4.sysctl_tcp_orphan_retries = 0;
2518         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2519         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2520         net->ipv4.sysctl_tcp_tw_reuse = 0;
2521
2522         cnt = tcp_hashinfo.ehash_mask + 1;
2523         net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
2524         net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2525
2526         net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2527         net->ipv4.sysctl_tcp_sack = 1;
2528         net->ipv4.sysctl_tcp_window_scaling = 1;
2529         net->ipv4.sysctl_tcp_timestamps = 1;
2530         net->ipv4.sysctl_tcp_early_retrans = 3;
2531         net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2532         net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2533         net->ipv4.sysctl_tcp_retrans_collapse = 1;
2534         net->ipv4.sysctl_tcp_max_reordering = 300;
2535         net->ipv4.sysctl_tcp_dsack = 1;
2536         net->ipv4.sysctl_tcp_app_win = 31;
2537         net->ipv4.sysctl_tcp_adv_win_scale = 1;
2538         net->ipv4.sysctl_tcp_frto = 2;
2539         net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2540         /* This limits the percentage of the congestion window which we
2541          * will allow a single TSO frame to consume.  Building TSO frames
2542          * which are too large can cause TCP streams to be bursty.
2543          */
2544         net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2545         /* Default TSQ limit of four TSO segments */
2546         net->ipv4.sysctl_tcp_limit_output_bytes = 262144;
2547         /* rfc5961 challenge ack rate limiting */
2548         net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2549         net->ipv4.sysctl_tcp_min_tso_segs = 2;
2550         net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2551         net->ipv4.sysctl_tcp_autocorking = 1;
2552         net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2553         net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2554         net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2555         if (net != &init_net) {
2556                 memcpy(net->ipv4.sysctl_tcp_rmem,
2557                        init_net.ipv4.sysctl_tcp_rmem,
2558                        sizeof(init_net.ipv4.sysctl_tcp_rmem));
2559                 memcpy(net->ipv4.sysctl_tcp_wmem,
2560                        init_net.ipv4.sysctl_tcp_wmem,
2561                        sizeof(init_net.ipv4.sysctl_tcp_wmem));
2562         }
2563         net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2564         spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2565         net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2566         atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2567
2568         /* Reno is always built in */
2569         if (!net_eq(net, &init_net) &&
2570             try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2571                 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2572         else
2573                 net->ipv4.tcp_congestion_control = &tcp_reno;
2574
2575         return 0;
2576 fail:
2577         tcp_sk_exit(net);
2578
2579         return res;
2580 }
2581
2582 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2583 {
2584         struct net *net;
2585
2586         inet_twsk_purge(&tcp_hashinfo, AF_INET);
2587
2588         list_for_each_entry(net, net_exit_list, exit_list)
2589                 tcp_fastopen_ctx_destroy(net);
2590 }
2591
2592 static struct pernet_operations __net_initdata tcp_sk_ops = {
2593        .init       = tcp_sk_init,
2594        .exit       = tcp_sk_exit,
2595        .exit_batch = tcp_sk_exit_batch,
2596 };
2597
2598 void __init tcp_v4_init(void)
2599 {
2600         if (register_pernet_subsys(&tcp_sk_ops))
2601                 panic("Failed to create the TCP control socket.\n");
2602 }