tcp: add reordering timer in RACK loss detection

author Yuchung Cheng <ycheng@google.com>

Fri, 13 Jan 2017 06:11:33 +0000 (22:11 -0800)

committer David S. Miller <davem@davemloft.net>

Sat, 14 Jan 2017 03:37:16 +0000 (22:37 -0500)
author Yuchung Cheng <ycheng@google.com>
Fri, 13 Jan 2017 06:11:33 +0000 (22:11 -0800)
committer David S. Miller <davem@davemloft.net>
Sat, 14 Jan 2017 03:37:16 +0000 (22:37 -0500)
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h

index 85ee3879499ebc4ebd63a59b2c425918858154c6..84b2edde09b143bae31b71b0305e55e80962fd75 100644 (file)
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -144,6 +144,7 @@ struct inet_connection_sock {
  #define ICSK_TIME_PROBE0       3       /* Zero window probe timer */
  #define ICSK_TIME_EARLY_RETRANS 4      /* Early retransmit timer */
  #define ICSK_TIME_LOSS_PROBE   5       /* Tail loss probe timer */
+#define ICSK_TIME_REO_TIMEOUT  6       /* Reordering timer */
  
  static inline struct inet_connection_sock *inet_csk(const struct sock *sk)
  {
@@ -234,7 +235,8 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what,
         }
  
         if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0 ||
-           what == ICSK_TIME_EARLY_RETRANS || what ==  ICSK_TIME_LOSS_PROBE) {
+           what == ICSK_TIME_EARLY_RETRANS || what == ICSK_TIME_LOSS_PROBE ||
+           what == ICSK_TIME_REO_TIMEOUT) {
                 icsk->icsk_pending = what;
                 icsk->icsk_timeout = jiffies + when;
                 sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout);
diff --git a/include/net/tcp.h b/include/net/tcp.h

index 1439107658c2c3bbe8ae50f735b581ab8f212786..64fcdeb3358b5101112c238eacc20ded79c66efb 100644 (file)
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -143,6 +143,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
  #define TCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ/2U)) /* Maximal interval between probes
                                                          * for local resources.
                                                          */
+#define TCP_REO_TIMEOUT_MIN    (2000) /* Min RACK reordering timeout in usec */
  
  #define TCP_KEEPALIVE_TIME     (120*60*HZ)     /* two hours */
  #define TCP_KEEPALIVE_PROBES   9               /* Max of 9 keepalive probes    */
@@ -397,6 +398,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
  int tcp_child_process(struct sock *parent, struct sock *child,
                       struct sk_buff *skb);
  void tcp_enter_loss(struct sock *sk);
+void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag);
  void tcp_clear_retrans(struct tcp_sock *tp);
  void tcp_update_metrics(struct sock *sk);
  void tcp_init_metrics(struct sock *sk);
@@ -541,6 +543,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs);
  void tcp_retransmit_timer(struct sock *sk);
  void tcp_xmit_retransmit_queue(struct sock *);
  void tcp_simple_retransmit(struct sock *);
+void tcp_enter_recovery(struct sock *sk, bool ece_ack);
  int tcp_trim_head(struct sock *, struct sk_buff *, u32);
  int tcp_fragment(struct sock *, struct sk_buff *, u32, unsigned int, gfp_t);
  
@@ -1867,6 +1870,7 @@ extern void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now);
  extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked,
                              const struct skb_mstamp *xmit_time,
                              const struct skb_mstamp *ack_time);
+extern void tcp_rack_reo_timeout(struct sock *sk);
  
  /*
   * Save and compile IPv4 options, return a pointer to it
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c

index 4dea33e5f29572e09c29621ee8eadc4e60a9a9a2..d216e40623d3defa17afacb90de432793c64577a 100644 (file)
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -216,6 +216,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
  
         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
             icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+           icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
                 r->idiag_timer = 1;
                 r->idiag_retrans = icsk->icsk_retransmits;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c

index 8ccd171999bfd4c537e25cee94732b5389c85a86..be119182996300320e0800f595b001e0a7abf858 100644 (file)
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2522,8 +2522,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk)
         tcp_ecn_queue_cwr(tp);
  }
  
-static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked,
-                              int flag)
+void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag)
  {
         struct tcp_sock *tp = tcp_sk(sk);
         int sndcnt = 0;
@@ -2691,7 +2690,7 @@ void tcp_simple_retransmit(struct sock *sk)
  }
  EXPORT_SYMBOL(tcp_simple_retransmit);
  
-static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
+void tcp_enter_recovery(struct sock *sk, bool ece_ack)
  {
         struct tcp_sock *tp = tcp_sk(sk);
         int mib_idx;
@@ -3031,6 +3030,7 @@ void tcp_rearm_rto(struct sock *sk)
                 u32 rto = inet_csk(sk)->icsk_rto;
                 /* Offset the time elapsed after installing regular RTO */
                 if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+                   icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
                     icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
                         struct sk_buff *skb = tcp_write_queue_head(sk);
                         const u32 rto_time_stamp =
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c

index 56d756ecfb597855065299b0f8b14d73ddfcdbf3..ebf3e0c4967acb500e5b0699ce5b07e0a1e54cf1 100644 (file)
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2230,6 +2230,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
  
         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
             icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+           icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
                 timer_active    = 1;
                 timer_expires   = icsk->icsk_timeout;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c

index 1d5331a1b1dc2677316148ba9852c191e7ed0fd4..0ba9026cb70d07e79ed3a217096c716326a56f04 100644 (file)
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2960,7 +2960,8 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
                 if (tcp_in_cwnd_reduction(sk))
                         tp->prr_out += tcp_skb_pcount(skb);
  
-               if (skb == tcp_write_queue_head(sk))
+               if (skb == tcp_write_queue_head(sk) &&
+                   icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
                                                   inet_csk(sk)->icsk_rto,
                                                   TCP_RTO_MAX);
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c

index 557363cde58abb87aec526c935323dfea9eb9229..eb39b1b6d1dc1d43310350af753e165c1e0ed424 100644 (file)
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -32,19 +32,18 @@ static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
   * The current version is only used after recovery starts but can be
   * easily extended to detect the first loss.
   */
-static void tcp_rack_detect_loss(struct sock *sk, const struct skb_mstamp *now)
+static void tcp_rack_detect_loss(struct sock *sk, const struct skb_mstamp *now,
+                                u32 *reo_timeout)
  {
         struct tcp_sock *tp = tcp_sk(sk);
         struct sk_buff *skb;
         u32 reo_wnd;
  
+       *reo_timeout = 0;
         /* To be more reordering resilient, allow min_rtt/4 settling delay
          * (lower-bounded to 1000uS). We use min_rtt instead of the smoothed
          * RTT because reordering is often a path property and less related
          * to queuing or delayed ACKs.
-        *
-        * TODO: measure and adapt to the observed reordering delay, and
-        * use a timer to retransmit like the delayed early retransmit.
          */
         reo_wnd = 1000;
         if (tp->rack.reord && tcp_min_rtt(tp) != ~0U)
@@ -66,10 +65,23 @@ static void tcp_rack_detect_loss(struct sock *sk, const struct skb_mstamp *now)
                          * A packet is lost if its elapsed time is beyond
                          * the recent RTT plus the reordering window.
                          */
-                       if (skb_mstamp_us_delta(now, &skb->skb_mstamp) >
-                           tp->rack.rtt_us + reo_wnd) {
+                       u32 elapsed = skb_mstamp_us_delta(now,
+                                                         &skb->skb_mstamp);
+                       s32 remaining = tp->rack.rtt_us + reo_wnd - elapsed;
+
+                       if (remaining < 0) {
                                 tcp_rack_mark_skb_lost(sk, skb);
+                               continue;
                         }
+
+                       /* Skip ones marked lost but not yet retransmitted */
+                       if ((scb->sacked & TCPCB_LOST) &&
+                           !(scb->sacked & TCPCB_SACKED_RETRANS))
+                               continue;
+
+                       /* Record maximum wait time (+1 to avoid 0) */
+                       *reo_timeout = max_t(u32, *reo_timeout, 1 + remaining);
+
                 } else if (!(scb->sacked & TCPCB_RETRANS)) {
                         /* Original data are sent sequentially so stop early
                          * b/c the rest are all sent after rack_sent
@@ -82,12 +94,19 @@ static void tcp_rack_detect_loss(struct sock *sk, const struct skb_mstamp *now)
  void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now)
  {
         struct tcp_sock *tp = tcp_sk(sk);
+       u32 timeout;    /* usec, filled in by tcp_rack_detect_loss(); 0 = nothing to wait for */
  
         if (inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery || !tp->rack.advanced)
                 return;
+
         /* Reset the advanced flag to avoid unnecessary queue scanning */
         tp->rack.advanced = 0;
-       tcp_rack_detect_loss(sk, now);
+       tcp_rack_detect_loss(sk, now, &timeout);
+       if (timeout) {  /* non-zero: some skb is still inside its reordering window */
+               timeout = usecs_to_jiffies(timeout + TCP_REO_TIMEOUT_MIN);      /* pad by the minimum, then usec -> jiffies */
+               inet_csk_reset_xmit_timer(sk, ICSK_TIME_REO_TIMEOUT,
+                                         timeout, inet_csk(sk)->icsk_rto);     /* NOTE(review): last arg presumably caps the delay at the RTO -- confirm */
+       }
  }
  
  /* Record the most recently (re)sent time among the (s)acked packets
@@ -123,3 +142,27 @@ void tcp_rack_advance(struct tcp_sock *tp, u8 sacked,
         tp->rack.mstamp = *xmit_time;
         tp->rack.advanced = 1;
  }
+
+/* Reordering timer (ICSK_TIME_REO_TIMEOUT) expired: re-run RACK loss detection,
+ * retransmit if packets in flight changed, then re-arm the RTO if none pending.
+ */
+void tcp_rack_reo_timeout(struct sock *sk)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+       struct skb_mstamp now;
+       u32 timeout, prior_inflight;    /* timeout is written by detect_loss but not read here */
+
+       skb_mstamp_get(&now);
+       prior_inflight = tcp_packets_in_flight(tp);     /* snapshot to detect a change below */
+       tcp_rack_detect_loss(sk, &now, &timeout);
+       if (prior_inflight != tcp_packets_in_flight(tp)) {      /* presumably: skbs were newly marked lost */
+               if (inet_csk(sk)->icsk_ca_state != TCP_CA_Recovery) {
+                       tcp_enter_recovery(sk, false);
+                       if (!inet_csk(sk)->icsk_ca_ops->cong_control)   /* skipped when the CC module provides cong_control */
+                               tcp_cwnd_reduction(sk, 1, 0);
+               }
+               tcp_xmit_retransmit_queue(sk);
+       }
+       if (inet_csk(sk)->icsk_pending != ICSK_TIME_RETRANS)    /* leave an already-pending RTO untouched */
+               tcp_rearm_rto(sk);
+}
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c

index 29a9bd5f1225e96aef9052d69e19b3407f727e83..953c02a8566e0a7f06e6870209e57e58c34d2544 100644 (file)
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -563,6 +563,9 @@ void tcp_write_timer_handler(struct sock *sk)
         event = icsk->icsk_pending;
  
         switch (event) {
+       case ICSK_TIME_REO_TIMEOUT:
+               tcp_rack_reo_timeout(sk);
+               break;
         case ICSK_TIME_EARLY_RETRANS:
                 tcp_resume_early_retransmit(sk);
                 break;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c

index 228965dca3c529b1c80994d254fb59a8a19f3089..f52c3742b4044dd22f1ab2ecdabf42821ab5db2a 100644 (file)
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1746,6 +1746,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
  
         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
             icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+           icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
                 timer_active    = 1;
                 timer_expires   = icsk->icsk_timeout;
author	Yuchung Cheng <ycheng@google.com>
	Fri, 13 Jan 2017 06:11:33 +0000 (22:11 -0800)
committer	David S. Miller <davem@davemloft.net>
	Sat, 14 Jan 2017 03:37:16 +0000 (22:37 -0500)
include/net/inet_connection_sock.h		patch \| blob \| history
include/net/tcp.h		patch \| blob \| history
net/ipv4/inet_diag.c		patch \| blob \| history
net/ipv4/tcp_input.c		patch \| blob \| history
net/ipv4/tcp_ipv4.c		patch \| blob \| history
net/ipv4/tcp_output.c		patch \| blob \| history
net/ipv4/tcp_recovery.c		patch \| blob \| history
net/ipv4/tcp_timer.c		patch \| blob \| history
net/ipv6/tcp_ipv6.c		patch \| blob \| history