/* net/netfilter/nft_set_rbtree.c - red/black tree nftables set backend */
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
4  *
5  * Development of this code funded by Astaro AG (http://www.astaro.com/)
6  */
7
8 #include <linux/kernel.h>
9 #include <linux/init.h>
10 #include <linux/module.h>
11 #include <linux/list.h>
12 #include <linux/rbtree.h>
13 #include <linux/netlink.h>
14 #include <linux/netfilter.h>
15 #include <linux/netfilter/nf_tables.h>
16 #include <net/netfilter/nf_tables_core.h>
17
/* Per-set private data for the rbtree backend. */
struct nft_rbtree {
	struct rb_root		root;		/* element tree */
	rwlock_t		lock;		/* serializes writers and slow-path readers */
	seqcount_t		count;		/* lets lockless readers detect concurrent writers */
	struct delayed_work	gc_work;	/* periodic expired-element collection */
};
24
/* One set element: tree linkage plus the generic extension area
 * (key, flags, timeout, ...).
 */
struct nft_rbtree_elem {
	struct rb_node		node;
	struct nft_set_ext	ext;
};
29
30 static bool nft_rbtree_interval_end(const struct nft_rbtree_elem *rbe)
31 {
32         return nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) &&
33                (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END);
34 }
35
36 static bool nft_rbtree_equal(const struct nft_set *set, const void *this,
37                              const struct nft_rbtree_elem *interval)
38 {
39         return memcmp(this, nft_set_ext_key(&interval->ext), set->klen) == 0;
40 }
41
/* Lockless (RCU) descent of the tree looking for @key.
 *
 * Interval sets store each interval as two elements: a start element
 * and an end element flagged NFT_SET_ELEM_INTERVAL_END.  A key with no
 * exact match may still fall inside an interval; @interval tracks the
 * closest start candidate seen along the search path.
 *
 * @seq is the caller's seqcount snapshot: the walk bails out (returns
 * false) as soon as a concurrent writer bumps the count, and the
 * caller retries under the read lock.
 *
 * Returns true and sets *ext on a match, false otherwise.
 */
static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
				const u32 *key, const struct nft_set_ext **ext,
				unsigned int seq)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	const struct nft_rbtree_elem *rbe, *interval = NULL;
	u8 genmask = nft_genmask_cur(net);
	const struct rb_node *parent;
	const void *this;
	int d;

	parent = rcu_dereference_raw(priv->root.rb_node);
	while (parent != NULL) {
		/* A writer raced with us: abort, caller falls back to
		 * the locked slow path.
		 */
		if (read_seqcount_retry(&priv->count, seq))
			return false;

		rbe = rb_entry(parent, struct nft_rbtree_elem, node);

		this = nft_set_ext_key(&rbe->ext);
		d = memcmp(this, key, set->klen);
		if (d < 0) {
			/* Node key sorts before the search key; this
			 * tree keeps larger keys to the left (see
			 * __nft_rbtree_insert()), so descend left.
			 */
			parent = rcu_dereference_raw(parent->rb_left);
			/* Keep the noted interval start if this node is
			 * the matching end of that same interval;
			 * otherwise this node is the new candidate.
			 */
			if (interval &&
			    nft_rbtree_equal(set, this, interval) &&
			    nft_rbtree_interval_end(rbe) &&
			    !nft_rbtree_interval_end(interval))
				continue;
			interval = rbe;
		} else if (d > 0)
			parent = rcu_dereference_raw(parent->rb_right);
		else {
			/* Exact key match; skip elements not active in
			 * the current generation.
			 */
			if (!nft_set_elem_active(&rbe->ext, genmask)) {
				parent = rcu_dereference_raw(parent->rb_left);
				continue;
			}
			if (nft_rbtree_interval_end(rbe)) {
				/* End elements do not match themselves;
				 * anonymous sets stop here outright.
				 */
				if (nft_set_is_anonymous(set))
					return false;
				parent = rcu_dereference_raw(parent->rb_left);
				interval = NULL;
				continue;
			}

			*ext = &rbe->ext;
			return true;
		}
	}

	/* No exact match: report the enclosing interval, if any. */
	if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
	    nft_set_elem_active(&interval->ext, genmask) &&
	    !nft_rbtree_interval_end(interval)) {
		*ext = &interval->ext;
		return true;
	}

	return false;
}
99
100 static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
101                               const u32 *key, const struct nft_set_ext **ext)
102 {
103         struct nft_rbtree *priv = nft_set_priv(set);
104         unsigned int seq = read_seqcount_begin(&priv->count);
105         bool ret;
106
107         ret = __nft_rbtree_lookup(net, set, key, ext, seq);
108         if (ret || !read_seqcount_retry(&priv->count, seq))
109                 return ret;
110
111         read_lock_bh(&priv->lock);
112         seq = read_seqcount_begin(&priv->count);
113         ret = __nft_rbtree_lookup(net, set, key, ext, seq);
114         read_unlock_bh(&priv->lock);
115
116         return ret;
117 }
118
/* Locate the element matching @key on behalf of nft_rbtree_get().
 *
 * Unlike __nft_rbtree_lookup(), the caller may explicitly request the
 * end element of an interval by setting NFT_SET_ELEM_INTERVAL_END in
 * @flags, and the result is the element itself rather than just its
 * extension.  @seq is the caller's seqcount snapshot; the walk aborts
 * when a writer interferes so the caller can retry under the lock.
 */
static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set,
			     const u32 *key, struct nft_rbtree_elem **elem,
			     unsigned int seq, unsigned int flags, u8 genmask)
{
	struct nft_rbtree_elem *rbe, *interval = NULL;
	struct nft_rbtree *priv = nft_set_priv(set);
	const struct rb_node *parent;
	const void *this;
	int d;

	parent = rcu_dereference_raw(priv->root.rb_node);
	while (parent != NULL) {
		/* Concurrent writer detected: give up, caller retries. */
		if (read_seqcount_retry(&priv->count, seq))
			return false;

		rbe = rb_entry(parent, struct nft_rbtree_elem, node);

		this = nft_set_ext_key(&rbe->ext);
		d = memcmp(this, key, set->klen);
		if (d < 0) {
			parent = rcu_dereference_raw(parent->rb_left);
			/* Remember a candidate start element when the
			 * caller is after start elements.
			 */
			if (!(flags & NFT_SET_ELEM_INTERVAL_END))
				interval = rbe;
		} else if (d > 0) {
			parent = rcu_dereference_raw(parent->rb_right);
			/* Remember a candidate end element when the
			 * caller is after end elements.
			 */
			if (flags & NFT_SET_ELEM_INTERVAL_END)
				interval = rbe;
		} else {
			if (!nft_set_elem_active(&rbe->ext, genmask)) {
				parent = rcu_dereference_raw(parent->rb_left);
				continue;
			}

			/* Exact key match with the requested start/end
			 * role; elements without a flags extension are
			 * treated as start elements.
			 */
			if (!nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) ||
			    (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END) ==
			    (flags & NFT_SET_ELEM_INTERVAL_END)) {
				*elem = rbe;
				return true;
			}

			if (nft_rbtree_interval_end(rbe))
				interval = NULL;

			parent = rcu_dereference_raw(parent->rb_left);
		}
	}

	/* No exact match: fall back to the nearest interval element of
	 * the requested role, if it is active.
	 */
	if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
	    nft_set_elem_active(&interval->ext, genmask) &&
	    ((!nft_rbtree_interval_end(interval) &&
	      !(flags & NFT_SET_ELEM_INTERVAL_END)) ||
	     (nft_rbtree_interval_end(interval) &&
	      (flags & NFT_SET_ELEM_INTERVAL_END)))) {
		*elem = interval;
		return true;
	}

	return false;
}
178
179 static void *nft_rbtree_get(const struct net *net, const struct nft_set *set,
180                             const struct nft_set_elem *elem, unsigned int flags)
181 {
182         struct nft_rbtree *priv = nft_set_priv(set);
183         unsigned int seq = read_seqcount_begin(&priv->count);
184         struct nft_rbtree_elem *rbe = ERR_PTR(-ENOENT);
185         const u32 *key = (const u32 *)&elem->key.val;
186         u8 genmask = nft_genmask_cur(net);
187         bool ret;
188
189         ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask);
190         if (ret || !read_seqcount_retry(&priv->count, seq))
191                 return rbe;
192
193         read_lock_bh(&priv->lock);
194         seq = read_seqcount_begin(&priv->count);
195         ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask);
196         if (!ret)
197                 rbe = ERR_PTR(-ENOENT);
198         read_unlock_bh(&priv->lock);
199
200         return rbe;
201 }
202
/* Core insertion, called with the write lock and seqcount held.
 *
 * The tree is ordered by memcmp() with larger keys placed to the left.
 * An interval's start and end element may share a key: the start is
 * kept to the left of the end.  An element already active in the next
 * generation with the same key and role fails the insert with -EEXIST
 * (its extension reported via @ext); inactive clashes left over from
 * an aborted transaction are stepped over to the left.
 */
static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
			       struct nft_rbtree_elem *new,
			       struct nft_set_ext **ext)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	u8 genmask = nft_genmask_next(net);
	struct nft_rbtree_elem *rbe;
	struct rb_node *parent, **p;
	int d;

	parent = NULL;
	p = &priv->root.rb_node;
	while (*p != NULL) {
		parent = *p;
		rbe = rb_entry(parent, struct nft_rbtree_elem, node);
		d = memcmp(nft_set_ext_key(&rbe->ext),
			   nft_set_ext_key(&new->ext),
			   set->klen);
		if (d < 0)
			p = &parent->rb_left;
		else if (d > 0)
			p = &parent->rb_right;
		else {
			/* Same key: disambiguate by start/end role. */
			if (nft_rbtree_interval_end(rbe) &&
			    !nft_rbtree_interval_end(new)) {
				p = &parent->rb_left;
			} else if (!nft_rbtree_interval_end(rbe) &&
				   nft_rbtree_interval_end(new)) {
				p = &parent->rb_right;
			} else if (nft_set_elem_active(&rbe->ext, genmask)) {
				*ext = &rbe->ext;
				return -EEXIST;
			} else {
				/* Inactive duplicate: keep descending. */
				p = &parent->rb_left;
			}
		}
	}
	/* RCU-aware link so lockless readers see a consistent tree. */
	rb_link_node_rcu(&new->node, parent, p);
	rb_insert_color(&new->node, &priv->root);
	return 0;
}
244
245 static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
246                              const struct nft_set_elem *elem,
247                              struct nft_set_ext **ext)
248 {
249         struct nft_rbtree *priv = nft_set_priv(set);
250         struct nft_rbtree_elem *rbe = elem->priv;
251         int err;
252
253         write_lock_bh(&priv->lock);
254         write_seqcount_begin(&priv->count);
255         err = __nft_rbtree_insert(net, set, rbe, ext);
256         write_seqcount_end(&priv->count);
257         write_unlock_bh(&priv->lock);
258
259         return err;
260 }
261
262 static void nft_rbtree_remove(const struct net *net,
263                               const struct nft_set *set,
264                               const struct nft_set_elem *elem)
265 {
266         struct nft_rbtree *priv = nft_set_priv(set);
267         struct nft_rbtree_elem *rbe = elem->priv;
268
269         write_lock_bh(&priv->lock);
270         write_seqcount_begin(&priv->count);
271         rb_erase(&rbe->node, &priv->root);
272         write_seqcount_end(&priv->count);
273         write_unlock_bh(&priv->lock);
274 }
275
276 static void nft_rbtree_activate(const struct net *net,
277                                 const struct nft_set *set,
278                                 const struct nft_set_elem *elem)
279 {
280         struct nft_rbtree_elem *rbe = elem->priv;
281
282         nft_set_elem_change_active(net, set, &rbe->ext);
283         nft_set_elem_clear_busy(&rbe->ext);
284 }
285
286 static bool nft_rbtree_flush(const struct net *net,
287                              const struct nft_set *set, void *priv)
288 {
289         struct nft_rbtree_elem *rbe = priv;
290
291         if (!nft_set_elem_mark_busy(&rbe->ext) ||
292             !nft_is_active(net, &rbe->ext)) {
293                 nft_set_elem_change_active(net, set, &rbe->ext);
294                 return true;
295         }
296         return false;
297 }
298
/* .deactivate backend op: find the tree element corresponding to
 * @elem in the next generation, mark it inactive via
 * nft_rbtree_flush() and return it, or NULL when no active match
 * exists.  Start and end elements sharing a key are disambiguated by
 * the interval-end flag, mirroring the insert-time ordering.
 */
static void *nft_rbtree_deactivate(const struct net *net,
				   const struct nft_set *set,
				   const struct nft_set_elem *elem)
{
	const struct nft_rbtree *priv = nft_set_priv(set);
	const struct rb_node *parent = priv->root.rb_node;
	struct nft_rbtree_elem *rbe, *this = elem->priv;
	u8 genmask = nft_genmask_next(net);
	int d;

	while (parent != NULL) {
		rbe = rb_entry(parent, struct nft_rbtree_elem, node);

		d = memcmp(nft_set_ext_key(&rbe->ext), &elem->key.val,
					   set->klen);
		if (d < 0)
			parent = parent->rb_left;
		else if (d > 0)
			parent = parent->rb_right;
		else {
			/* Same key but mismatched start/end role: keep
			 * descending toward the matching twin.
			 */
			if (nft_rbtree_interval_end(rbe) &&
			    !nft_rbtree_interval_end(this)) {
				parent = parent->rb_left;
				continue;
			} else if (!nft_rbtree_interval_end(rbe) &&
				   nft_rbtree_interval_end(this)) {
				parent = parent->rb_right;
				continue;
			} else if (!nft_set_elem_active(&rbe->ext, genmask)) {
				/* Not active in the next generation. */
				parent = parent->rb_left;
				continue;
			}
			nft_rbtree_flush(net, set, rbe);
			return rbe;
		}
	}
	return NULL;
}
337
338 static void nft_rbtree_walk(const struct nft_ctx *ctx,
339                             struct nft_set *set,
340                             struct nft_set_iter *iter)
341 {
342         struct nft_rbtree *priv = nft_set_priv(set);
343         struct nft_rbtree_elem *rbe;
344         struct nft_set_elem elem;
345         struct rb_node *node;
346
347         read_lock_bh(&priv->lock);
348         for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
349                 rbe = rb_entry(node, struct nft_rbtree_elem, node);
350
351                 if (iter->count < iter->skip)
352                         goto cont;
353                 if (!nft_set_elem_active(&rbe->ext, iter->genmask))
354                         goto cont;
355
356                 elem.priv = rbe;
357
358                 iter->err = iter->fn(ctx, set, iter, &elem);
359                 if (iter->err < 0) {
360                         read_unlock_bh(&priv->lock);
361                         return;
362                 }
363 cont:
364                 iter->count++;
365         }
366         read_unlock_bh(&priv->lock);
367 }
368
/* Periodic garbage collection of expired elements, run from a
 * deferrable workqueue and re-armed at the end of each pass.
 *
 * The whole tree is walked under the write lock.  Erasing the
 * collected start element is deferred by one round (@rbe_prev) so
 * rb_next() can still be called on it; the end element seen just
 * before it (@rbe_end) is erased immediately.  Collected elements go
 * onto a GC batch and are freed after an RCU grace period by
 * nft_set_gc_batch_complete().
 */
static void nft_rbtree_gc(struct work_struct *work)
{
	struct nft_rbtree_elem *rbe, *rbe_end = NULL, *rbe_prev = NULL;
	struct nft_set_gc_batch *gcb = NULL;
	struct nft_rbtree *priv;
	struct rb_node *node;
	struct nft_set *set;

	priv = container_of(work, struct nft_rbtree, gc_work.work);
	set  = nft_set_container_of(priv);

	write_lock_bh(&priv->lock);
	write_seqcount_begin(&priv->count);
	for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
		rbe = rb_entry(node, struct nft_rbtree_elem, node);

		/* End elements are collected together with their start
		 * element; just note the most recent one for now.
		 */
		if (nft_rbtree_interval_end(rbe)) {
			rbe_end = rbe;
			continue;
		}
		if (!nft_set_elem_expired(&rbe->ext))
			continue;
		/* Leave elements busy in a pending transaction alone. */
		if (nft_set_elem_mark_busy(&rbe->ext))
			continue;

		/* Erase the start element collected on the previous
		 * round, now that iteration has moved past it.
		 */
		if (rbe_prev) {
			rb_erase(&rbe_prev->node, &priv->root);
			rbe_prev = NULL;
		}
		gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
		if (!gcb)
			break;

		atomic_dec(&set->nelems);
		nft_set_gc_batch_add(gcb, rbe);
		rbe_prev = rbe;

		if (rbe_end) {
			atomic_dec(&set->nelems);
			nft_set_gc_batch_add(gcb, rbe_end);
			rb_erase(&rbe_end->node, &priv->root);
			rbe_end = NULL;
		}
		/* NOTE(review): this explicit advance plus the loop
		 * increment steps two nodes past a collected element;
		 * presumably intentional, but worth confirming that no
		 * neighbouring element is skipped until the next pass.
		 */
		node = rb_next(node);
		if (!node)
			break;
	}
	/* Flush the last deferred erase before dropping the lock. */
	if (rbe_prev)
		rb_erase(&rbe_prev->node, &priv->root);
	write_seqcount_end(&priv->count);
	write_unlock_bh(&priv->lock);

	nft_set_gc_batch_complete(gcb);

	queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
			   nft_set_gc_interval(set));
}
426
/* .privsize backend op: size of the per-set private area; per-element
 * sizing is handled separately via .elemsize.
 */
static u64 nft_rbtree_privsize(const struct nlattr * const nla[],
			       const struct nft_set_desc *desc)
{
	return sizeof(struct nft_rbtree);
}
432
433 static int nft_rbtree_init(const struct nft_set *set,
434                            const struct nft_set_desc *desc,
435                            const struct nlattr * const nla[])
436 {
437         struct nft_rbtree *priv = nft_set_priv(set);
438
439         rwlock_init(&priv->lock);
440         seqcount_init(&priv->count);
441         priv->root = RB_ROOT;
442
443         INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rbtree_gc);
444         if (set->flags & NFT_SET_TIMEOUT)
445                 queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
446                                    nft_set_gc_interval(set));
447
448         return 0;
449 }
450
451 static void nft_rbtree_destroy(const struct nft_set *set)
452 {
453         struct nft_rbtree *priv = nft_set_priv(set);
454         struct nft_rbtree_elem *rbe;
455         struct rb_node *node;
456
457         cancel_delayed_work_sync(&priv->gc_work);
458         rcu_barrier();
459         while ((node = priv->root.rb_node) != NULL) {
460                 rb_erase(node, &priv->root);
461                 rbe = rb_entry(node, struct nft_rbtree_elem, node);
462                 nft_set_elem_destroy(set, rbe, true);
463         }
464 }
465
466 static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
467                                 struct nft_set_estimate *est)
468 {
469         if (desc->field_count > 1)
470                 return false;
471
472         if (desc->size)
473                 est->size = sizeof(struct nft_rbtree) +
474                             desc->size * sizeof(struct nft_rbtree_elem);
475         else
476                 est->size = ~0;
477
478         est->lookup = NFT_SET_CLASS_O_LOG_N;
479         est->space  = NFT_SET_CLASS_O_N;
480
481         return true;
482 }
483
/* Backend registration: rbtree set type with O(log n) lookup and
 * interval support, wired into the nf_tables set core.
 */
struct nft_set_type nft_set_rbtree_type __read_mostly = {
	.owner		= THIS_MODULE,
	.features	= NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT,
	.ops		= {
		.privsize	= nft_rbtree_privsize,
		.elemsize	= offsetof(struct nft_rbtree_elem, ext),
		.estimate	= nft_rbtree_estimate,
		.init		= nft_rbtree_init,
		.destroy	= nft_rbtree_destroy,
		.insert		= nft_rbtree_insert,
		.remove		= nft_rbtree_remove,
		.deactivate	= nft_rbtree_deactivate,
		.flush		= nft_rbtree_flush,
		.activate	= nft_rbtree_activate,
		.lookup		= nft_rbtree_lookup,
		.walk		= nft_rbtree_walk,
		.get		= nft_rbtree_get,
	},
};