diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index d1305423640f3abfcaf7e3295022c765d31d82e5..1927fc296f9514bcd5866d340c6f659bea0fdb3e 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -894,32 +894,175 @@ static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
        }
 }
 
-/* Resolve race on insertion if this protocol allows this. */
+static void __nf_conntrack_insert_prepare(struct nf_conn *ct)
+{
+       struct nf_conn_tstamp *tstamp;
+
+       atomic_inc(&ct->ct_general.use);
+       ct->status |= IPS_CONFIRMED;
+
+       /* set conntrack timestamp, if enabled. */
+       tstamp = nf_conn_tstamp_find(ct);
+       if (tstamp)
+               tstamp->start = ktime_get_real_ns();
+}
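
This helper factors out the confirm-time setup now shared by the normal insert path and the clash path: take the reference that the hash table will own, set IPS_CONFIRMED, and record the start timestamp when the tstamp extension is present. A minimal userspace sketch of those three steps, using C11 atomics -- toy_conn and its fields are invented stand-ins, not kernel definitions:

#include <stdatomic.h>
#include <stdint.h>
#include <time.h>

#define TOY_IPS_CONFIRMED (1 << 3)       /* same bit position as the kernel's IPS_CONFIRMED */

struct toy_conn {
	atomic_int use;                  /* reference count */
	unsigned long status;            /* IPS_* status bits */
	int tstamp_enabled;              /* stands in for nf_conn_tstamp_find() */
	uint64_t tstamp_start;           /* ns since epoch, 0 = unset */
};

static void toy_insert_prepare(struct toy_conn *ct)
{
	/* take the reference that the hash table will own */
	atomic_fetch_add(&ct->use, 1);
	ct->status |= TOY_IPS_CONFIRMED;

	/* record wall-clock start time, if timestamping is enabled */
	if (ct->tstamp_enabled) {
		struct timespec ts;

		clock_gettime(CLOCK_REALTIME, &ts);
		ct->tstamp_start = (uint64_t)ts.tv_sec * 1000000000ULL +
				   (uint64_t)ts.tv_nsec;
	}
}
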
+
+static int __nf_ct_resolve_clash(struct sk_buff *skb,
+                                struct nf_conntrack_tuple_hash *h)
+{
+       /* This is the conntrack entry already in hashes that won race. */
+       struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
+       enum ip_conntrack_info ctinfo;
+       struct nf_conn *loser_ct;
+
+       loser_ct = nf_ct_get(skb, &ctinfo);
+
+       if (nf_ct_is_dying(ct))
+               return NF_DROP;
+
+       if (!atomic_inc_not_zero(&ct->ct_general.use))
+               return NF_DROP;
+
+       if (((ct->status & IPS_NAT_DONE_MASK) == 0) ||
+           nf_ct_match(ct, loser_ct)) {
+               struct net *net = nf_ct_net(ct);
+
+               nf_ct_acct_merge(ct, ctinfo, loser_ct);
+               nf_ct_add_to_dying_list(loser_ct);
+               nf_conntrack_put(&loser_ct->ct_general);
+               nf_ct_set(skb, ct, ctinfo);
+
+               NF_CT_STAT_INC(net, insert_failed);
+               return NF_ACCEPT;
+       }
+
+       nf_ct_put(ct);
+       return NF_DROP;
+}
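
__nf_ct_resolve_clash() only merges the loser into the winner when that is safe: either NAT never rewrote the winner (IPS_NAT_DONE_MASK still clear), or both entries describe exactly the same flow in both directions. A toy predicate capturing that condition; the struct layout is invented, only the IPS_* bit positions mirror the kernel's:

#include <stdint.h>

#define IPS_SRC_NAT_DONE  (1 << 7)
#define IPS_DST_NAT_DONE  (1 << 8)
#define IPS_NAT_DONE_MASK (IPS_DST_NAT_DONE | IPS_SRC_NAT_DONE)

struct toy_tuple { uint32_t src, dst; uint16_t sport, dport; uint8_t proto; };

struct toy_ct {
	unsigned long status;
	struct toy_tuple orig, reply;    /* both directions of the flow */
};

static int tuple_equal(const struct toy_tuple *a, const struct toy_tuple *b)
{
	return a->src == b->src && a->dst == b->dst &&
	       a->sport == b->sport && a->dport == b->dport &&
	       a->proto == b->proto;
}

/* Merging is safe if NAT never touched the winner, or if winner and
 * loser carry identical tuples in both directions anyway.
 */
static int toy_can_merge(const struct toy_ct *winner, const struct toy_ct *loser)
{
	if ((winner->status & IPS_NAT_DONE_MASK) == 0)
		return 1;
	return tuple_equal(&winner->orig, &loser->orig) &&
	       tuple_equal(&winner->reply, &loser->reply);
}
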
+
+/**
+ * nf_ct_resolve_clash_harder - attempt to insert clashing conntrack entry
+ *
+ * @skb: skb that causes the collision
+ * @repl_idx: hash slot for reply direction
+ *
+ * Called when origin or reply direction had a clash.
+ * The skb can be handled without packet drop provided the reply direction
+ * is unique or the existing entry has the identical tuple in both
+ * directions.
+ *
+ * Caller must hold conntrack table locks to prevent concurrent updates.
+ *
+ * Returns NF_DROP if the clash could not be handled.
+ */
+static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx)
+{
+       struct nf_conn *loser_ct = (struct nf_conn *)skb_nfct(skb);
+       const struct nf_conntrack_zone *zone;
+       struct nf_conntrack_tuple_hash *h;
+       struct hlist_nulls_node *n;
+       struct net *net;
+
+       zone = nf_ct_zone(loser_ct);
+       net = nf_ct_net(loser_ct);
+
+       /* Reply direction must never result in a clash, unless both origin
+        * and reply tuples are identical.
+        */
+       hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[repl_idx], hnnode) {
+               if (nf_ct_key_equal(h,
+                                   &loser_ct->tuplehash[IP_CT_DIR_REPLY].tuple,
+                                   zone, net))
+                       return __nf_ct_resolve_clash(skb, h);
+       }
+
+       /* We want the clashing entry to go away real soon: 1 second timeout. */
+       loser_ct->timeout = nfct_time_stamp + HZ;
+
+       /* IPS_NAT_CLASH removes the entry automatically on the first
+        * reply.  Also prevents UDP tracker from moving the entry to
+        * ASSURED state, i.e. the entry can always be evicted under
+        * pressure.
+        */
+       loser_ct->status |= IPS_FIXED_TIMEOUT | IPS_NAT_CLASH;
+
+       __nf_conntrack_insert_prepare(loser_ct);
+
+       /* fake add for ORIGINAL dir: we want lookups to only find the entry
+        * already in the table.  This also hides the clashing entry from
+        * ctnetlink iteration, i.e. conntrack -L won't show it.
+        */
+       hlist_nulls_add_fake(&loser_ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
+
+       hlist_nulls_add_head_rcu(&loser_ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
+                                &nf_conntrack_hash[repl_idx]);
+       return NF_ACCEPT;
+}
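
The trick that makes this safe is the fake add: the loser's ORIGINAL-direction node is made to look linked, so generic unlink code still works at teardown, while no hash bucket references it, so lookups and ctnetlink dumps never reach it. A simplified sketch of that idea -- toy_node is an invented stand-in, and the real hlist_nulls_add_fake() stores a nulls end marker in ->next rather than NULL:

struct toy_node {
	struct toy_node *next;
	struct toy_node **pprev;
};

/* Point @n at itself so generic unlink code keeps working, while no
 * bucket head ever references it: bucket walks cannot reach the node.
 */
static void toy_add_fake(struct toy_node *n)
{
	n->pprev = &n->next;
	n->next = (void *)0;
}

/* Standard hlist-style unlink; works for both real and fake nodes. */
static void toy_unlink(struct toy_node *n)
{
	*n->pprev = n->next;
	if (n->next)
		n->next->pprev = n->pprev;
}
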
+
+/**
+ * nf_ct_resolve_clash - attempt to handle clash without packet drop
+ *
+ * @skb: skb that causes the clash
+ * @h: tuplehash of the clashing entry already in table
+ * @reply_hash: hash slot for reply direction
+ *
+ * A conntrack entry can be inserted to the connection tracking table
+ * if there is no existing entry with an identical tuple.
+ *
+ * If there is one, @skb (and the associated, unconfirmed conntrack) has
+ * to be dropped.  In case @skb is retransmitted, the next conntrack lookup
+ * will find the already-existing entry.
+ *
+ * The major problem with such packet drop is the extra delay added by
+ * the packet loss -- it will take some time for a retransmit to occur
+ * (or the sender to time out when waiting for a reply).
+ *
+ * This function attempts to handle the situation without packet drop.
+ *
+ * If @skb has no NAT transformation or if the colliding entries are
+ * exactly the same, only the to-be-confirmed conntrack entry is discarded
+ * and @skb is associated with the conntrack entry already in the table.
+ *
+ * Failing that, the new, unconfirmed conntrack is still added to the table
+ * provided that the collision only occurs in the ORIGINAL direction.
+ * The new entry will be added after the existing one in the hash list,
+ * so packets in the ORIGINAL direction will continue to match the existing
+ * entry.  The new entry will also have a fixed timeout so it expires --
+ * due to the collision, it will not see bidirectional traffic.
+ *
+ * Returns NF_DROP if the clash could not be resolved.
+ */
 static __cold noinline int
-nf_ct_resolve_clash(struct net *net, struct sk_buff *skb,
-                   enum ip_conntrack_info ctinfo,
-                   struct nf_conntrack_tuple_hash *h)
+nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h,
+                   u32 reply_hash)
 {
        /* This is the conntrack entry already in hashes that won race. */
        struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
        const struct nf_conntrack_l4proto *l4proto;
-       enum ip_conntrack_info oldinfo;
-       struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo);
+       enum ip_conntrack_info ctinfo;
+       struct nf_conn *loser_ct;
+       struct net *net;
+       int ret;
+
+       loser_ct = nf_ct_get(skb, &ctinfo);
+       net = nf_ct_net(loser_ct);
 
        l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct));
-       if (l4proto->allow_clash &&
-           !nf_ct_is_dying(ct) &&
-           atomic_inc_not_zero(&ct->ct_general.use)) {
-               if (((ct->status & IPS_NAT_DONE_MASK) == 0) ||
-                   nf_ct_match(ct, loser_ct)) {
-                       nf_ct_acct_merge(ct, ctinfo, loser_ct);
-                       nf_conntrack_put(&loser_ct->ct_general);
-                       nf_ct_set(skb, ct, oldinfo);
-                       return NF_ACCEPT;
-               }
-               nf_ct_put(ct);
-       }
+       if (!l4proto->allow_clash)
+               goto drop;
+
+       ret = __nf_ct_resolve_clash(skb, h);
+       if (ret == NF_ACCEPT)
+               return ret;
+
+       ret = nf_ct_resolve_clash_harder(skb, reply_hash);
+       if (ret == NF_ACCEPT)
+               return ret;
+
+drop:
+       nf_ct_add_to_dying_list(loser_ct);
        NF_CT_STAT_INC(net, drop);
+       NF_CT_STAT_INC(net, insert_failed);
        return NF_DROP;
 }
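
The function is now a two-stage fallback: attempt the direct merge, then the reply-direction insert, and drop only if both fail (bumping both the drop and insert_failed counters). A toy model of that control flow; NF_ACCEPT and NF_DROP match the netfilter verdict values, everything else is an invented stub:

#define NF_DROP   0
#define NF_ACCEPT 1

struct toy_skb;
struct toy_hash;

/* stubs standing in for __nf_ct_resolve_clash() and
 * nf_ct_resolve_clash_harder()
 */
static int toy_resolve_direct(struct toy_skb *skb, struct toy_hash *h)
{
	(void)skb; (void)h;
	return NF_DROP;                  /* pretend the merge was unsafe */
}

static int toy_resolve_harder(struct toy_skb *skb, unsigned int repl_idx)
{
	(void)skb; (void)repl_idx;
	return NF_DROP;                  /* pretend the reply slot clashed too */
}

static int toy_resolve_clash(struct toy_skb *skb, struct toy_hash *h,
			     unsigned int reply_hash, int allow_clash)
{
	int ret;

	if (!allow_clash)                /* protocol opted out, e.g. TCP */
		return NF_DROP;

	ret = toy_resolve_direct(skb, h);
	if (ret == NF_ACCEPT)
		return ret;              /* merged into the existing entry */

	return toy_resolve_harder(skb, reply_hash);
}
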
 
@@ -932,7 +1075,6 @@ __nf_conntrack_confirm(struct sk_buff *skb)
        struct nf_conntrack_tuple_hash *h;
        struct nf_conn *ct;
        struct nf_conn_help *help;
-       struct nf_conn_tstamp *tstamp;
        struct hlist_nulls_node *n;
        enum ip_conntrack_info ctinfo;
        struct net *net;
@@ -989,6 +1131,7 @@ __nf_conntrack_confirm(struct sk_buff *skb)
 
        if (unlikely(nf_ct_is_dying(ct))) {
                nf_ct_add_to_dying_list(ct);
+               NF_CT_STAT_INC(net, insert_failed);
                goto dying;
        }
 
@@ -1009,13 +1152,8 @@ __nf_conntrack_confirm(struct sk_buff *skb)
           setting time, otherwise we'd get timer wrap in
           weird delay cases. */
        ct->timeout += nfct_time_stamp;
-       atomic_inc(&ct->ct_general.use);
-       ct->status |= IPS_CONFIRMED;
 
-       /* set conntrack timestamp, if enabled. */
-       tstamp = nf_conn_tstamp_find(ct);
-       if (tstamp)
-               tstamp->start = ktime_get_real_ns();
+       __nf_conntrack_insert_prepare(ct);
 
        /* Since the lookup is lockless, hash insertion must be done after
         * starting the timer and setting the CONFIRMED bit. The RCU barriers
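
This ordering constraint is why __nf_conntrack_insert_prepare() runs before the hash insertion: lookups are lockless, so the entry must be fully initialized before it becomes reachable. A C11-atomics sketch of the same publish-after-init discipline -- it mirrors the intent of the RCU barriers in hlist_nulls_add_head_rcu(), not the kernel API itself:

#include <stdatomic.h>

struct toy_entry {
	unsigned long timeout;
	unsigned long status;
};

static _Atomic(struct toy_entry *) toy_bucket_head;

static void toy_publish(struct toy_entry *e)
{
	e->timeout = 1234;       /* "start the timer"... */
	e->status |= 1;          /* ...and "set CONFIRMED" first */

	/* release store pairs with the readers' acquire load below, so a
	 * lockless lookup never observes a half-initialized entry
	 */
	atomic_store_explicit(&toy_bucket_head, e, memory_order_release);
}

static struct toy_entry *toy_lookup(void)
{
	return atomic_load_explicit(&toy_bucket_head, memory_order_acquire);
}
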
@@ -1035,11 +1173,9 @@ __nf_conntrack_confirm(struct sk_buff *skb)
        return NF_ACCEPT;
 
 out:
-       nf_ct_add_to_dying_list(ct);
-       ret = nf_ct_resolve_clash(net, skb, ctinfo, h);
+       ret = nf_ct_resolve_clash(skb, h, reply_hash);
 dying:
        nf_conntrack_double_unlock(hash, reply_hash);
-       NF_CT_STAT_INC(net, insert_failed);
        local_bh_enable();
        return ret;
 }
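
Both timeouts touched by this patch -- the relative-to-absolute conversion ct->timeout += nfct_time_stamp and the clash entry's one-second nfct_time_stamp + HZ -- rely on wrap-safe tick arithmetic: expiry is tested via a signed difference, so the comparison survives counter wraparound. A small self-contained demo of the idiom; TOY_HZ and the tick values are invented:

#include <stdint.h>
#include <stdio.h>

#define TOY_HZ 100u                      /* ticks per second in this sketch */

/* expired iff @timeout is at or behind @now, even across a u32 wrap */
static int toy_expired(uint32_t timeout, uint32_t now)
{
	return (int32_t)(timeout - now) <= 0;
}

int main(void)
{
	uint32_t now = 0xfffffff0u;      /* just before the counter wraps */
	uint32_t timeout = now + TOY_HZ; /* "1 second" away; wraps past zero */

	printf("%d\n", toy_expired(timeout, now));              /* 0: not yet */
	printf("%d\n", toy_expired(timeout, now + 2 * TOY_HZ)); /* 1: expired */
	return 0;
}
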