]> asedeno.scripts.mit.edu Git - linux.git/blob - net/sched/sch_api.c
Merge tag 'microblaze-v5.6-rc1' of git://git.monstr.eu/linux-2.6-microblaze
[linux.git] / net / sched / sch_api.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_api.c  Packet scheduler API.
4  *
5  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
6  *
7  * Fixes:
8  *
9  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
10  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
11  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
12  */
13
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/string.h>
18 #include <linux/errno.h>
19 #include <linux/skbuff.h>
20 #include <linux/init.h>
21 #include <linux/proc_fs.h>
22 #include <linux/seq_file.h>
23 #include <linux/kmod.h>
24 #include <linux/list.h>
25 #include <linux/hrtimer.h>
26 #include <linux/slab.h>
27 #include <linux/hashtable.h>
28
29 #include <net/net_namespace.h>
30 #include <net/sock.h>
31 #include <net/netlink.h>
32 #include <net/pkt_sched.h>
33 #include <net/pkt_cls.h>
34
35 /*
36
37    Short review.
38    -------------
39
40    This file consists of two interrelated parts:
41
42    1. queueing disciplines manager frontend.
43    2. traffic classes manager frontend.
44
45    Generally, queueing discipline ("qdisc") is a black box,
46    which is able to enqueue packets and to dequeue them (when
47    device is ready to send something) in order and at times
48    determined by algorithm hidden in it.
49
   qdiscs are divided into two categories:
51    - "queues", which have no internal structure visible from outside.
52    - "schedulers", which split all the packets to "traffic classes",
53      using "packet classifiers" (look at cls_api.c)
54
55    In turn, classes may have child qdiscs (as rule, queues)
56    attached to them etc. etc. etc.
57
   The goal of the routines in this file is to translate
   the information supplied by the user in the form of handles
   into a form more intelligible to the kernel, to perform some
   sanity checks and common work shared by all qdiscs,
   and to provide rtnetlink notifications.
63
64    All real intelligent work is done inside qdisc modules.
65
66
67
68    Every discipline has two major routines: enqueue and dequeue.
69
70    ---dequeue
71
72    dequeue usually returns a skb to send. It is allowed to return NULL,
73    but it does not mean that queue is empty, it just means that
74    discipline does not want to send anything this time.
75    Queue is really empty if q->q.qlen == 0.
76    For complicated disciplines with multiple queues q->q is not
77    real packet queue, but however q->q.qlen must be valid.
78
79    ---enqueue
80
   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code.
84    NET_XMIT_DROP        - this packet dropped
85      Expected action: do not backoff, but wait until queue will clear.
86    NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
87      Expected action: backoff or ignore
88
89    Auxiliary routines:
90
91    ---peek
92
93    like dequeue but without removing a packet from the queue
94
95    ---reset
96
97    returns qdisc to initial state: purge all buffers, clear all
98    timers, counters (except for statistics) etc.
99
100    ---init
101
102    initializes newly created qdisc.
103
104    ---destroy
105
106    destroys resources allocated by init and during lifetime of qdisc.
107
108    ---change
109
110    changes qdisc parameters.
111  */
112
113 /* Protects list of registered TC modules. It is pure SMP lock. */
114 static DEFINE_RWLOCK(qdisc_mod_lock);
115
116
117 /************************************************
118  *      Queueing disciplines manipulation.      *
119  ************************************************/
120
121
122 /* The list of all installed queueing disciplines. */
123
124 static struct Qdisc_ops *qdisc_base;
125
126 /* Register/unregister queueing discipline */
127
128 int register_qdisc(struct Qdisc_ops *qops)
129 {
130         struct Qdisc_ops *q, **qp;
131         int rc = -EEXIST;
132
133         write_lock(&qdisc_mod_lock);
134         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
135                 if (!strcmp(qops->id, q->id))
136                         goto out;
137
138         if (qops->enqueue == NULL)
139                 qops->enqueue = noop_qdisc_ops.enqueue;
140         if (qops->peek == NULL) {
141                 if (qops->dequeue == NULL)
142                         qops->peek = noop_qdisc_ops.peek;
143                 else
144                         goto out_einval;
145         }
146         if (qops->dequeue == NULL)
147                 qops->dequeue = noop_qdisc_ops.dequeue;
148
149         if (qops->cl_ops) {
150                 const struct Qdisc_class_ops *cops = qops->cl_ops;
151
152                 if (!(cops->find && cops->walk && cops->leaf))
153                         goto out_einval;
154
155                 if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
156                         goto out_einval;
157         }
158
159         qops->next = NULL;
160         *qp = qops;
161         rc = 0;
162 out:
163         write_unlock(&qdisc_mod_lock);
164         return rc;
165
166 out_einval:
167         rc = -EINVAL;
168         goto out;
169 }
170 EXPORT_SYMBOL(register_qdisc);
171
172 int unregister_qdisc(struct Qdisc_ops *qops)
173 {
174         struct Qdisc_ops *q, **qp;
175         int err = -ENOENT;
176
177         write_lock(&qdisc_mod_lock);
178         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
179                 if (q == qops)
180                         break;
181         if (q) {
182                 *qp = q->next;
183                 q->next = NULL;
184                 err = 0;
185         }
186         write_unlock(&qdisc_mod_lock);
187         return err;
188 }
189 EXPORT_SYMBOL(unregister_qdisc);
190
191 /* Get default qdisc if not otherwise specified */
/* Copy the id string of the current default qdisc into @name
 * (at most @len bytes, NUL-terminated by strlcpy).  The read lock
 * keeps default_qdisc_ops stable while its id is copied.
 */
void qdisc_get_default(char *name, size_t len)
{
	read_lock(&qdisc_mod_lock);
	strlcpy(name, default_qdisc_ops->id, len);
	read_unlock(&qdisc_mod_lock);
}
198
199 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
200 {
201         struct Qdisc_ops *q = NULL;
202
203         for (q = qdisc_base; q; q = q->next) {
204                 if (!strcmp(name, q->id)) {
205                         if (!try_module_get(q->owner))
206                                 q = NULL;
207                         break;
208                 }
209         }
210
211         return q;
212 }
213
214 /* Set new default qdisc to use */
215 int qdisc_set_default(const char *name)
216 {
217         const struct Qdisc_ops *ops;
218
219         if (!capable(CAP_NET_ADMIN))
220                 return -EPERM;
221
222         write_lock(&qdisc_mod_lock);
223         ops = qdisc_lookup_default(name);
224         if (!ops) {
225                 /* Not found, drop lock and try to load module */
226                 write_unlock(&qdisc_mod_lock);
227                 request_module("sch_%s", name);
228                 write_lock(&qdisc_mod_lock);
229
230                 ops = qdisc_lookup_default(name);
231         }
232
233         if (ops) {
234                 /* Set new default */
235                 module_put(default_qdisc_ops->owner);
236                 default_qdisc_ops = ops;
237         }
238         write_unlock(&qdisc_mod_lock);
239
240         return ops ? 0 : -ENOENT;
241 }
242
243 #ifdef CONFIG_NET_SCH_DEFAULT
/* Install the Kconfig-selected default scheduler.  Runs as a
 * late_initcall so all builtin qdiscs have registered first.
 */
static int __init sch_default_qdisc(void)
{
	return qdisc_set_default(CONFIG_DEFAULT_NET_SCH);
}
late_initcall(sch_default_qdisc);
250 #endif
251
252 /* We know handle. Find qdisc among all qdisc's attached to device
253  * (root qdisc, all its children, children of children etc.)
254  * Note: caller either uses rtnl or rcu_read_lock()
255  */
256
257 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
258 {
259         struct Qdisc *q;
260
261         if (!qdisc_dev(root))
262                 return (root->handle == handle ? root : NULL);
263
264         if (!(root->flags & TCQ_F_BUILTIN) &&
265             root->handle == handle)
266                 return root;
267
268         hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle) {
269                 if (q->handle == handle)
270                         return q;
271         }
272         return NULL;
273 }
274
275 void qdisc_hash_add(struct Qdisc *q, bool invisible)
276 {
277         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
278                 ASSERT_RTNL();
279                 hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
280                 if (invisible)
281                         q->flags |= TCQ_F_INVISIBLE;
282         }
283 }
284 EXPORT_SYMBOL(qdisc_hash_add);
285
286 void qdisc_hash_del(struct Qdisc *q)
287 {
288         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
289                 ASSERT_RTNL();
290                 hash_del_rcu(&q->hash);
291         }
292 }
293 EXPORT_SYMBOL(qdisc_hash_del);
294
295 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
296 {
297         struct Qdisc *q;
298
299         if (!handle)
300                 return NULL;
301         q = qdisc_match_from_root(dev->qdisc, handle);
302         if (q)
303                 goto out;
304
305         if (dev_ingress_queue(dev))
306                 q = qdisc_match_from_root(
307                         dev_ingress_queue(dev)->qdisc_sleeping,
308                         handle);
309 out:
310         return q;
311 }
312
313 struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
314 {
315         struct netdev_queue *nq;
316         struct Qdisc *q;
317
318         if (!handle)
319                 return NULL;
320         q = qdisc_match_from_root(dev->qdisc, handle);
321         if (q)
322                 goto out;
323
324         nq = dev_ingress_queue_rcu(dev);
325         if (nq)
326                 q = qdisc_match_from_root(nq->qdisc_sleeping, handle);
327 out:
328         return q;
329 }
330
331 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
332 {
333         unsigned long cl;
334         const struct Qdisc_class_ops *cops = p->ops->cl_ops;
335
336         if (cops == NULL)
337                 return NULL;
338         cl = cops->find(p, classid);
339
340         if (cl == 0)
341                 return NULL;
342         return cops->leaf(p, cl);
343 }
344
345 /* Find queueing discipline by name */
346
347 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
348 {
349         struct Qdisc_ops *q = NULL;
350
351         if (kind) {
352                 read_lock(&qdisc_mod_lock);
353                 for (q = qdisc_base; q; q = q->next) {
354                         if (nla_strcmp(kind, q->id) == 0) {
355                                 if (!try_module_get(q->owner))
356                                         q = NULL;
357                                 break;
358                         }
359                 }
360                 read_unlock(&qdisc_mod_lock);
361         }
362         return q;
363 }
364
365 /* The linklayer setting were not transferred from iproute2, in older
366  * versions, and the rate tables lookup systems have been dropped in
367  * the kernel. To keep backward compatible with older iproute2 tc
368  * utils, we detect the linklayer setting by detecting if the rate
369  * table were modified.
370  *
371  * For linklayer ATM table entries, the rate table will be aligned to
372  * 48 bytes, thus some table entries will contain the same value.  The
373  * mpu (min packet unit) is also encoded into the old rate table, thus
374  * starting from the mpu, we find low and high table entries for
 * mapping this cell.  If these entries contain the same value, then
 * the rate table has been modified for linklayer ATM.
377  *
378  * This is done by rounding mpu to the nearest 48 bytes cell/entry,
379  * and then roundup to the next cell, calc the table entry one below,
380  * and compare.
381  */
382 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
383 {
384         int low       = roundup(r->mpu, 48);
385         int high      = roundup(low+1, 48);
386         int cell_low  = low >> r->cell_log;
387         int cell_high = (high >> r->cell_log) - 1;
388
389         /* rtab is too inaccurate at rates > 100Mbit/s */
390         if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
391                 pr_debug("TC linklayer: Giving up ATM detection\n");
392                 return TC_LINKLAYER_ETHERNET;
393         }
394
395         if ((cell_high > cell_low) && (cell_high < 256)
396             && (rtab[cell_low] == rtab[cell_high])) {
397                 pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
398                          cell_low, cell_high, rtab[cell_high]);
399                 return TC_LINKLAYER_ATM;
400         }
401         return TC_LINKLAYER_ETHERNET;
402 }
403
404 static struct qdisc_rate_table *qdisc_rtab_list;
405
406 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
407                                         struct nlattr *tab,
408                                         struct netlink_ext_ack *extack)
409 {
410         struct qdisc_rate_table *rtab;
411
412         if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
413             nla_len(tab) != TC_RTAB_SIZE) {
414                 NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
415                 return NULL;
416         }
417
418         for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
419                 if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
420                     !memcmp(&rtab->data, nla_data(tab), 1024)) {
421                         rtab->refcnt++;
422                         return rtab;
423                 }
424         }
425
426         rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
427         if (rtab) {
428                 rtab->rate = *r;
429                 rtab->refcnt = 1;
430                 memcpy(rtab->data, nla_data(tab), 1024);
431                 if (r->linklayer == TC_LINKLAYER_UNAWARE)
432                         r->linklayer = __detect_linklayer(r, rtab->data);
433                 rtab->next = qdisc_rtab_list;
434                 qdisc_rtab_list = rtab;
435         } else {
436                 NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
437         }
438         return rtab;
439 }
440 EXPORT_SYMBOL(qdisc_get_rtab);
441
442 void qdisc_put_rtab(struct qdisc_rate_table *tab)
443 {
444         struct qdisc_rate_table *rtab, **rtabp;
445
446         if (!tab || --tab->refcnt)
447                 return;
448
449         for (rtabp = &qdisc_rtab_list;
450              (rtab = *rtabp) != NULL;
451              rtabp = &rtab->next) {
452                 if (rtab == tab) {
453                         *rtabp = rtab->next;
454                         kfree(rtab);
455                         return;
456                 }
457         }
458 }
459 EXPORT_SYMBOL(qdisc_put_rtab);
460
461 static LIST_HEAD(qdisc_stab_list);
462
463 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
464         [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
465         [TCA_STAB_DATA] = { .type = NLA_BINARY },
466 };
467
/* Look up (or create) a shared size table from the nested TCA_STAB
 * attribute @opt.  Tables with identical parameters and data are
 * shared via a refcount.  Returns ERR_PTR() on parse, validation or
 * allocation failure.
 */
static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
					       struct netlink_ext_ack *extack)
{
	struct nlattr *tb[TCA_STAB_MAX + 1];
	struct qdisc_size_table *stab;
	struct tc_sizespec *s;
	unsigned int tsize = 0;
	u16 *tab = NULL;
	int err;

	err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy,
					  extack);
	if (err < 0)
		return ERR_PTR(err);
	if (!tb[TCA_STAB_BASE]) {
		NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
		return ERR_PTR(-EINVAL);
	}

	s = nla_data(tb[TCA_STAB_BASE]);

	/* A non-zero advertised size requires the data attribute. */
	if (s->tsize > 0) {
		if (!tb[TCA_STAB_DATA]) {
			NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
			return ERR_PTR(-EINVAL);
		}
		tab = nla_data(tb[TCA_STAB_DATA]);
		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
	}

	/* The advertised entry count must match the payload length. */
	if (tsize != s->tsize || (!tab && tsize > 0)) {
		NL_SET_ERR_MSG(extack, "Invalid size of size table");
		return ERR_PTR(-EINVAL);
	}

	/* Reuse an existing table with identical spec and data. */
	list_for_each_entry(stab, &qdisc_stab_list, list) {
		if (memcmp(&stab->szopts, s, sizeof(*s)))
			continue;
		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
			continue;
		stab->refcnt++;
		return stab;
	}

	/* No match: allocate spec plus trailing data in one block. */
	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
	if (!stab)
		return ERR_PTR(-ENOMEM);

	stab->refcnt = 1;
	stab->szopts = *s;
	if (tsize > 0)
		memcpy(stab->data, tab, tsize * sizeof(u16));

	list_add_tail(&stab->list, &qdisc_stab_list);

	return stab;
}
525
526 void qdisc_put_stab(struct qdisc_size_table *tab)
527 {
528         if (!tab)
529                 return;
530
531         if (--tab->refcnt == 0) {
532                 list_del(&tab->list);
533                 kfree_rcu(tab, rcu);
534         }
535 }
536 EXPORT_SYMBOL(qdisc_put_stab);
537
538 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
539 {
540         struct nlattr *nest;
541
542         nest = nla_nest_start_noflag(skb, TCA_STAB);
543         if (nest == NULL)
544                 goto nla_put_failure;
545         if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
546                 goto nla_put_failure;
547         nla_nest_end(skb, nest);
548
549         return skb->len;
550
551 nla_put_failure:
552         return -1;
553 }
554
555 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
556                                const struct qdisc_size_table *stab)
557 {
558         int pkt_len, slot;
559
560         pkt_len = skb->len + stab->szopts.overhead;
561         if (unlikely(!stab->szopts.tsize))
562                 goto out;
563
564         slot = pkt_len + stab->szopts.cell_align;
565         if (unlikely(slot < 0))
566                 slot = 0;
567
568         slot >>= stab->szopts.cell_log;
569         if (likely(slot < stab->szopts.tsize))
570                 pkt_len = stab->data[slot];
571         else
572                 pkt_len = stab->data[stab->szopts.tsize - 1] *
573                                 (slot / stab->szopts.tsize) +
574                                 stab->data[slot % stab->szopts.tsize];
575
576         pkt_len <<= stab->szopts.size_log;
577 out:
578         if (unlikely(pkt_len < 1))
579                 pkt_len = 1;
580         qdisc_skb_cb(skb)->pkt_len = pkt_len;
581 }
582 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
583
584 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
585 {
586         if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
587                 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
588                         txt, qdisc->ops->id, qdisc->handle >> 16);
589                 qdisc->flags |= TCQ_F_WARN_NONWC;
590         }
591 }
592 EXPORT_SYMBOL(qdisc_warn_nonwc);
593
/* hrtimer callback: reschedule the root qdisc of the watchdog's
 * owner so the transmit path runs again.  One-shot — the timer is
 * re-armed explicitly via qdisc_watchdog_schedule_ns().
 */
static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
{
	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
						 timer);

	rcu_read_lock();
	__netif_schedule(qdisc_root(wd->qdisc));
	rcu_read_unlock();

	return HRTIMER_NORESTART;
}
605
/* Initialize @wd's hrtimer on @clockid (absolute, CPU-pinned mode)
 * with qdisc_watchdog() as the expiry handler for @qdisc.
 */
void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
				 clockid_t clockid)
{
	hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
	wd->timer.function = qdisc_watchdog;
	wd->qdisc = qdisc;
}
EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
614
/* Convenience wrapper: initialize a watchdog on CLOCK_MONOTONIC. */
void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
	qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
}
EXPORT_SYMBOL(qdisc_watchdog_init);
620
/* Arm the watchdog to fire at absolute time @expires (ns).  No-op if
 * the root qdisc is deactivated or the timer is already armed for
 * exactly this expiry.
 */
void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
{
	if (test_bit(__QDISC_STATE_DEACTIVATED,
		     &qdisc_root_sleeping(wd->qdisc)->state))
		return;

	/* Avoid re-arming the hrtimer for an unchanged expiry. */
	if (wd->last_expires == expires)
		return;

	wd->last_expires = expires;
	hrtimer_start(&wd->timer,
		      ns_to_ktime(expires),
		      HRTIMER_MODE_ABS_PINNED);
}
EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
636
/* Cancel the watchdog's hrtimer (hrtimer_cancel also waits for a
 * concurrently running handler to finish).
 */
void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
	hrtimer_cancel(&wd->timer);
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
642
643 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
644 {
645         struct hlist_head *h;
646         unsigned int i;
647
648         h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
649
650         if (h != NULL) {
651                 for (i = 0; i < n; i++)
652                         INIT_HLIST_HEAD(&h[i]);
653         }
654         return h;
655 }
656
/* Double @clhash's bucket array when the load factor exceeds 0.75
 * and rehash every class.  The swap happens under sch_tree_lock so
 * concurrent lookups never see a half-built table; allocation
 * failure simply leaves the old (still valid) table in place.
 */
void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
{
	struct Qdisc_class_common *cl;
	struct hlist_node *next;
	struct hlist_head *nhash, *ohash;
	unsigned int nsize, nmask, osize;
	unsigned int i, h;

	/* Rehash when load factor exceeds 0.75 */
	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
		return;
	nsize = clhash->hashsize * 2;
	nmask = nsize - 1;
	nhash = qdisc_class_hash_alloc(nsize);
	if (nhash == NULL)
		return;

	ohash = clhash->hash;
	osize = clhash->hashsize;

	sch_tree_lock(sch);
	/* Move every class into its bucket in the larger table. */
	for (i = 0; i < osize; i++) {
		hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
			h = qdisc_class_hash(cl->classid, nmask);
			hlist_add_head(&cl->hnode, &nhash[h]);
		}
	}
	clhash->hash     = nhash;
	clhash->hashsize = nsize;
	clhash->hashmask = nmask;
	sch_tree_unlock(sch);

	kvfree(ohash);
}
EXPORT_SYMBOL(qdisc_class_hash_grow);
692
693 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
694 {
695         unsigned int size = 4;
696
697         clhash->hash = qdisc_class_hash_alloc(size);
698         if (!clhash->hash)
699                 return -ENOMEM;
700         clhash->hashsize  = size;
701         clhash->hashmask  = size - 1;
702         clhash->hashelems = 0;
703         return 0;
704 }
705 EXPORT_SYMBOL(qdisc_class_hash_init);
706
/* Free the bucket array only; the classes it held are owned and
 * freed by the qdisc itself.
 */
void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
	kvfree(clhash->hash);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);
712
713 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
714                              struct Qdisc_class_common *cl)
715 {
716         unsigned int h;
717
718         INIT_HLIST_NODE(&cl->hnode);
719         h = qdisc_class_hash(cl->classid, clhash->hashmask);
720         hlist_add_head(&cl->hnode, &clhash->hash[h]);
721         clhash->hashelems++;
722 }
723 EXPORT_SYMBOL(qdisc_class_hash_insert);
724
/* Unlink @cl from the class hash and decrement the element count.
 * NOTE(review): presumably serialized with qdisc_class_hash_grow()
 * by the qdisc tree lock — confirm at the call sites.
 */
void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	hlist_del(&cl->hnode);
	clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);
732
733 /* Allocate an unique handle from space managed by kernel
734  * Possible range is [8000-FFFF]:0000 (0x8000 values)
735  */
736 static u32 qdisc_alloc_handle(struct net_device *dev)
737 {
738         int i = 0x8000;
739         static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
740
741         do {
742                 autohandle += TC_H_MAKE(0x10000U, 0);
743                 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
744                         autohandle = TC_H_MAKE(0x80000000U, 0);
745                 if (!qdisc_lookup(dev, autohandle))
746                         return autohandle;
747                 cond_resched();
748         } while (--i > 0);
749
750         return 0;
751 }
752
/* Propagate a decrease of @n packets / @len bytes from @sch up
 * through all ancestor qdiscs, adjusting their qlen/backlog and drop
 * counters.  A parent whose child just became empty gets a
 * qlen_notify() callback so it can deactivate the class.
 */
void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len)
{
	bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
	const struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;
	bool notify;
	int drops;

	if (n == 0 && len == 0)
		return;
	/* Only a positive packet delta is counted as drops. */
	drops = max_t(int, n, 0);
	rcu_read_lock();
	while ((parentid = sch->parent)) {
		/* Stop at ingress qdiscs and parentless qdiscs. */
		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
			break;

		if (sch->flags & TCQ_F_NOPARENT)
			break;
		/* Notify parent qdisc only if child qdisc becomes empty.
		 *
		 * If child was empty even before update then backlog
		 * counter is screwed and we skip notification because
		 * parent class is already passive.
		 *
		 * If the original child was offloaded then it is allowed
		 * to be seen as empty, so the parent is notified anyway.
		 */
		notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
						       !qdisc_is_offloaded);
		/* TODO: perform the search on a per txq basis */
		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
		if (sch == NULL) {
			WARN_ON_ONCE(parentid != TC_H_ROOT);
			break;
		}
		cops = sch->ops->cl_ops;
		if (notify && cops->qlen_notify) {
			cl = cops->find(sch, parentid);
			cops->qlen_notify(sch, cl);
		}
		sch->q.qlen -= n;
		sch->qstats.backlog -= len;
		__qdisc_qstats_drop(sch, drops);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
801
802 int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
803                               void *type_data)
804 {
805         struct net_device *dev = qdisc_dev(sch);
806         int err;
807
808         sch->flags &= ~TCQ_F_OFFLOADED;
809         if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
810                 return 0;
811
812         err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
813         if (err == -EOPNOTSUPP)
814                 return 0;
815
816         if (!err)
817                 sch->flags |= TCQ_F_OFFLOADED;
818
819         return err;
820 }
821 EXPORT_SYMBOL(qdisc_offload_dump_helper);
822
823 void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
824                                 struct Qdisc *new, struct Qdisc *old,
825                                 enum tc_setup_type type, void *type_data,
826                                 struct netlink_ext_ack *extack)
827 {
828         bool any_qdisc_is_offloaded;
829         int err;
830
831         if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
832                 return;
833
834         err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
835
836         /* Don't report error if the graft is part of destroy operation. */
837         if (!err || !new || new == &noop_qdisc)
838                 return;
839
840         /* Don't report error if the parent, the old child and the new
841          * one are not offloaded.
842          */
843         any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
844         any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
845         any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;
846
847         if (any_qdisc_is_offloaded)
848                 NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
849 }
850 EXPORT_SYMBOL(qdisc_offload_graft_helper);
851
852 static void qdisc_offload_graft_root(struct net_device *dev,
853                                      struct Qdisc *new, struct Qdisc *old,
854                                      struct netlink_ext_ack *extack)
855 {
856         struct tc_root_qopt_offload graft_offload = {
857                 .command        = TC_ROOT_GRAFT,
858                 .handle         = new ? new->handle : 0,
859                 .ingress        = (new && new->flags & TCQ_F_INGRESS) ||
860                                   (old && old->flags & TCQ_F_INGRESS),
861         };
862
863         qdisc_offload_graft_helper(dev, NULL, new, old,
864                                    TC_SETUP_ROOT_QDISC, &graft_offload, extack);
865 }
866
/* Fill a netlink message describing qdisc @q (kind, options, block
 * indices, size table and statistics) into @skb.  Returns the new
 * skb length, or -1 with the partial message trimmed off when @skb
 * ran out of room.
 */
static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 portid, u32 seq, u16 flags, int event)
{
	struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
	struct gnet_stats_queue __percpu *cpu_qstats = NULL;
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	struct qdisc_size_table *stab;
	u32 block_index;
	__u32 qlen;

	cond_resched();
	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	/* tcm_info carries the current refcount of the qdisc. */
	tcm->tcm_info = refcount_read(&q->refcnt);
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	/* Ingress/egress shared-block indices, if the qdisc has them. */
	if (q->ops->ingress_block_get) {
		block_index = q->ops->ingress_block_get(q);
		if (block_index &&
		    nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
			goto nla_put_failure;
	}
	if (q->ops->egress_block_get) {
		block_index = q->ops->egress_block_get(q);
		if (block_index &&
		    nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
			goto nla_put_failure;
	}
	/* Qdisc-specific options (TCA_OPTIONS etc.). */
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto nla_put_failure;
	if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
		goto nla_put_failure;
	qlen = qdisc_qlen_sum(q);

	stab = rtnl_dereference(q->stab);
	if (stab && qdisc_dump_stab(skb, stab) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 NULL, &d, TCA_PAD) < 0)
		goto nla_put_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto nla_put_failure;

	/* Per-cpu stats are aggregated by the gnet_stats helpers. */
	if (qdisc_is_percpu_stats(q)) {
		cpu_bstats = q->cpu_bstats;
		cpu_qstats = q->cpu_qstats;
	}

	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
				  &d, cpu_bstats, &q->bstats) < 0 ||
	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
	    gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}
945
946 static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
947 {
948         if (q->flags & TCQ_F_BUILTIN)
949                 return true;
950         if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
951                 return true;
952
953         return false;
954 }
955
/* Send an RTM_DELQDISC notification for 'old' and/or an RTM_NEWQDISC
 * for 'new' to the RTNLGRP_TC group.  'oskb' is the triggering request
 * and is used only to pick the requester's portid (0 when kernel
 * initiated).  Built-in/invisible qdiscs are not reported.
 */
static int qdisc_notify(struct net *net, struct sk_buff *oskb,
			struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new)
{
	struct sk_buff *skb;
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && !tc_qdisc_dump_ignore(old, false)) {
		if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
				  0, RTM_DELQDISC) < 0)
			goto err_out;
	}
	if (new && !tc_qdisc_dump_ignore(new, false)) {
		/* NLM_F_REPLACE signals this NEWQDISC replaces 'old' */
		if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
				  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
			goto err_out;
	}

	if (skb->len)
		return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
				      n->nlmsg_flags & NLM_F_ECHO);

err_out:
	/* nothing to report, or a message did not fit */
	kfree_skb(skb);
	return -EINVAL;
}
986
987 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
988                                struct nlmsghdr *n, u32 clid,
989                                struct Qdisc *old, struct Qdisc *new)
990 {
991         if (new || old)
992                 qdisc_notify(net, skb, n, clid, old, new);
993
994         if (old)
995                 qdisc_put(old);
996 }
997
998 static void qdisc_clear_nolock(struct Qdisc *sch)
999 {
1000         sch->flags &= ~TCQ_F_NOLOCK;
1001         if (!(sch->flags & TCQ_F_CPUSTATS))
1002                 return;
1003
1004         free_percpu(sch->cpu_bstats);
1005         free_percpu(sch->cpu_qstats);
1006         sch->cpu_bstats = NULL;
1007         sch->cpu_qstats = NULL;
1008         sch->flags &= ~TCQ_F_CPUSTATS;
1009 }
1010
1011 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
1012  * to device "dev".
1013  *
1014  * When appropriate send a netlink notification using 'skb'
1015  * and "n".
1016  *
1017  * On success, destroy old qdisc.
1018  */
1019
static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old,
		       struct netlink_ext_ack *extack)
{
	struct Qdisc *q = old;
	struct net *net = dev_net(dev);

	if (parent == NULL) {
		/* Grafting directly onto the device: replace the root
		 * (or ingress) qdisc on every relevant queue.
		 */
		unsigned int i, num_q, ingress;

		ingress = 0;
		num_q = dev->num_tx_queues;
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {
			num_q = 1;	/* ingress side has a single queue */
			ingress = 1;
			if (!dev_ingress_queue(dev)) {
				NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
				return -ENOENT;
			}
		}

		/* Quiesce the device while qdiscs are being swapped */
		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		qdisc_offload_graft_root(dev, new, old, extack);

		/* Qdiscs with ->attach() (e.g. mq/mqprio) distribute
		 * themselves over the tx queues; skip the manual
		 * per-queue graft loop for those.
		 */
		if (new && new->ops->attach)
			goto skip;

		for (i = 0; i < num_q; i++) {
			struct netdev_queue *dev_queue = dev_ingress_queue(dev);

			if (!ingress)
				dev_queue = netdev_get_tx_queue(dev, i);

			old = dev_graft_qdisc(dev_queue, new);
			/* Each queue beyond the first holds its own ref */
			if (new && i > 0)
				qdisc_refcount_inc(new);

			if (!ingress)
				qdisc_put(old);
		}

skip:
		if (!ingress) {
			notify_and_destroy(net, skb, n, classid,
					   dev->qdisc, new);
			/* dev->qdisc keeps one extra reference */
			if (new && !new->ops->attach)
				qdisc_refcount_inc(new);
			dev->qdisc = new ? : &noop_qdisc;

			if (new && new->ops->attach)
				new->ops->attach(new);
		} else {
			notify_and_destroy(net, skb, n, classid, old, new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		/* Grafting into a class of an existing parent qdisc */
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
		unsigned long cl;
		int err;

		/* Only support running class lockless if parent is lockless */
		if (new && (new->flags & TCQ_F_NOLOCK) &&
		    parent && !(parent->flags & TCQ_F_NOLOCK))
			qdisc_clear_nolock(new);

		if (!cops || !cops->graft)
			return -EOPNOTSUPP;

		cl = cops->find(parent, classid);
		if (!cl) {
			NL_SET_ERR_MSG(extack, "Specified class not found");
			return -ENOENT;
		}

		err = cops->graft(parent, cl, new, &old, extack);
		if (err)
			return err;
		notify_and_destroy(net, skb, n, classid, old, new);
	}
	return 0;
}
1107
1108 static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
1109                                    struct netlink_ext_ack *extack)
1110 {
1111         u32 block_index;
1112
1113         if (tca[TCA_INGRESS_BLOCK]) {
1114                 block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
1115
1116                 if (!block_index) {
1117                         NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
1118                         return -EINVAL;
1119                 }
1120                 if (!sch->ops->ingress_block_set) {
1121                         NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
1122                         return -EOPNOTSUPP;
1123                 }
1124                 sch->ops->ingress_block_set(sch, block_index);
1125         }
1126         if (tca[TCA_EGRESS_BLOCK]) {
1127                 block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
1128
1129                 if (!block_index) {
1130                         NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
1131                         return -EINVAL;
1132                 }
1133                 if (!sch->ops->egress_block_set) {
1134                         NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
1135                         return -EOPNOTSUPP;
1136                 }
1137                 sch->ops->egress_block_set(sch, block_index);
1138         }
1139         return 0;
1140 }
1141
1142 /*
1143    Allocate and initialize new qdisc.
1144
1145    Parameters are passed via opt.
1146  */
1147
/* Allocate and initialize a new qdisc of the kind named in tca[TCA_KIND],
 * attached to 'dev_queue' with parent classid 'parent' and handle 'handle'
 * (0 means auto-allocate).  On failure returns NULL and stores a negative
 * errno in *errp; -EAGAIN means a module was loaded and the caller must
 * replay the whole request.
 */
static struct Qdisc *qdisc_create(struct net_device *dev,
				  struct netdev_queue *dev_queue,
				  struct Qdisc *p, u32 parent, u32 handle,
				  struct nlattr **tca, int *errp,
				  struct netlink_ext_ack *extack)
{
	int err;
	struct nlattr *kind = tca[TCA_KIND];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
	struct qdisc_size_table *stab;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_MODULES
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load.  So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request.  We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the mean time.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* We will try again qdisc_lookup_ops,
				 * so don't keep a reference.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (!ops) {
		NL_SET_ERR_MSG(extack, "Specified qdisc not found");
		goto err_out;
	}

	/* qdisc_alloc() takes a reference on 'dev' and on ops->owner */
	sch = qdisc_alloc(dev_queue, ops, extack);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	sch->parent = parent;

	if (handle == TC_H_INGRESS) {
		sch->flags |= TCQ_F_INGRESS;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
	} else {
		if (handle == 0) {
			/* Auto-allocate a free major handle */
			handle = qdisc_alloc_handle(dev);
			if (handle == 0) {
				NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded");
				err = -ENOSPC;
				goto err_out3;
			}
		}
		if (!netif_is_multiqueue(dev))
			sch->flags |= TCQ_F_ONETXQUEUE;
	}

	sch->handle = handle;

	/* This exist to keep backward compatible with a userspace
	 * loophole, what allowed userspace to get IFF_NO_QUEUE
	 * facility on older kernels by setting tx_queue_len=0 (prior
	 * to qdisc init), and then forgot to reinit tx_queue_len
	 * before again attaching a qdisc.
	 */
	if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
		netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
	}

	err = qdisc_block_indexes_set(sch, tca, extack);
	if (err)
		goto err_out3;

	if (ops->init) {
		err = ops->init(sch, tca[TCA_OPTIONS], extack);
		if (err != 0)
			goto err_out5;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB], extack);
		if (IS_ERR(stab)) {
			err = PTR_ERR(stab);
			goto err_out4;
		}
		rcu_assign_pointer(sch->stab, stab);
	}
	if (tca[TCA_RATE]) {
		seqcount_t *running;

		err = -EOPNOTSUPP;
		if (sch->flags & TCQ_F_MQROOT) {
			NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
			goto err_out4;
		}

		/* Pick the seqcount that protects this qdisc's stats:
		 * the root's one unless this is ingress or an mq child.
		 */
		if (sch->parent != TC_H_ROOT &&
		    !(sch->flags & TCQ_F_INGRESS) &&
		    (!p || !(p->flags & TCQ_F_MQROOT)))
			running = qdisc_root_sleeping_running(sch);
		else
			running = &sch->running;

		err = gen_new_estimator(&sch->bstats,
					sch->cpu_bstats,
					&sch->rate_est,
					NULL,
					running,
					tca[TCA_RATE]);
		if (err) {
			NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
			goto err_out4;
		}
	}

	qdisc_hash_add(sch, false);

	return sch;

/* Error unwinding: each label undoes what succeeded before it.
 * err_out5 runs when ops->init() itself failed; err_out4 (placed
 * after the success return) additionally drops the stab.
 */
err_out5:
	/* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
	if (ops->destroy)
		ops->destroy(sch);
err_out3:
	dev_put(dev);
	qdisc_free(sch);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;

err_out4:
	/*
	 * Any broken qdiscs that would require a ops->reset() here?
	 * The qdisc was never in action so it shouldn't be necessary.
	 */
	qdisc_put_stab(rtnl_dereference(sch->stab));
	if (ops->destroy)
		ops->destroy(sch);
	goto err_out3;
}
1305
/* Change an existing qdisc in place: apply new TCA_OPTIONS via the
 * qdisc's ->change() op, swap the size table, and replace the rate
 * estimator if requested.  Block indexes cannot be changed after
 * creation.  Returns 0 or a negative errno with extack set.
 */
static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
			struct netlink_ext_ack *extack)
{
	struct qdisc_size_table *ostab, *stab = NULL;
	int err = 0;

	if (tca[TCA_OPTIONS]) {
		if (!sch->ops->change) {
			NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
			return -EINVAL;
		}
		if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
			NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
			return -EOPNOTSUPP;
		}
		err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
		if (err)
			return err;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB], extack);
		if (IS_ERR(stab))
			return PTR_ERR(stab);
	}

	/* Publish the new size table (NULL when none given) and drop
	 * the old one after readers can no longer see it.
	 */
	ostab = rtnl_dereference(sch->stab);
	rcu_assign_pointer(sch->stab, stab);
	qdisc_put_stab(ostab);

	if (tca[TCA_RATE]) {
		/* NB: ignores errors from replace_estimator
		   because change can't be undone. */
		if (sch->flags & TCQ_F_MQROOT)
			goto out;
		gen_replace_estimator(&sch->bstats,
				      sch->cpu_bstats,
				      &sch->rate_est,
				      NULL,
				      qdisc_root_sleeping_running(sch),
				      tca[TCA_RATE]);
	}
out:
	return 0;
}
1351
/* Walker state for loop detection: 'p' is the qdisc being grafted and
 * 'depth' is the current recursion depth through child qdiscs.
 */
struct check_loop_arg {
	struct qdisc_walker	w;
	struct Qdisc		*p;
	int			depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl,
			 struct qdisc_walker *w);
1360
1361 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1362 {
1363         struct check_loop_arg   arg;
1364
1365         if (q->ops->cl_ops == NULL)
1366                 return 0;
1367
1368         arg.w.stop = arg.w.skip = arg.w.count = 0;
1369         arg.w.fn = check_loop_fn;
1370         arg.depth = depth;
1371         arg.p = p;
1372         q->ops->cl_ops->walk(q, &arg.w);
1373         return arg.w.stop ? -ELOOP : 0;
1374 }
1375
1376 static int
1377 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1378 {
1379         struct Qdisc *leaf;
1380         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1381         struct check_loop_arg *arg = (struct check_loop_arg *)w;
1382
1383         leaf = cops->leaf(q, cl);
1384         if (leaf) {
1385                 if (leaf == arg->p || arg->depth > 7)
1386                         return -ELOOP;
1387                 return check_loop(leaf, arg->p, arg->depth + 1);
1388         }
1389         return 0;
1390 }
1391
/* Netlink attribute validation policy shared by the TC qdisc/class
 * request handlers (attributes not listed are accepted unvalidated).
 */
const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
	[TCA_KIND]		= { .type = NLA_STRING },
	[TCA_RATE]		= { .type = NLA_BINARY,
				    .len = sizeof(struct tc_estimator) },
	[TCA_STAB]		= { .type = NLA_NESTED },
	[TCA_DUMP_INVISIBLE]	= { .type = NLA_FLAG },
	[TCA_CHAIN]		= { .type = NLA_U32 },
	[TCA_INGRESS_BLOCK]	= { .type = NLA_U32 },
	[TCA_EGRESS_BLOCK]	= { .type = NLA_U32 },
};
1402
1403 /*
1404  * Delete/get qdisc.
1405  */
1406
/* Handle RTM_DELQDISC and RTM_GETQDISC requests: locate the target
 * qdisc via tcm_parent/tcm_handle, then either delete it (via
 * qdisc_graft with new == NULL) or just notify the requester.
 */
static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
			struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	/* Deleting requires CAP_NET_ADMIN; plain GET does not */
	if ((n->nlmsg_type != RTM_GETQDISC) &&
	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
				     rtm_tca_policy, extack);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	clid = tcm->tcm_parent;
	if (clid) {
		/* Target is identified by its parent classid */
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p) {
					NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
					return -ENOENT;
				}
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}
		if (!q) {
			NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
			return -ENOENT;
		}

		/* A given handle, if any, must match the found qdisc */
		if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
			NL_SET_ERR_MSG(extack, "Invalid handle");
			return -EINVAL;
		}
	} else {
		/* No parent given: look the qdisc up by handle alone */
		q = qdisc_lookup(dev, tcm->tcm_handle);
		if (!q) {
			NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
			return -ENOENT;
		}
	}

	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
		return -EINVAL;
	}

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid) {
			NL_SET_ERR_MSG(extack, "Classid cannot be zero");
			return -EINVAL;
		}
		if (q->handle == 0) {
			NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
			return -ENOENT;
		}
		/* Deleting == grafting NULL in place of 'q' */
		err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
		if (err != 0)
			return err;
	} else {
		qdisc_notify(net, skb, n, clid, NULL, q);
	}
	return 0;
}
1487
1488 /*
1489  * Create/change qdisc.
1490  */
1491
/* Handle RTM_NEWQDISC: depending on the existing state and the
 * NLM_F_CREATE/REPLACE/EXCL flags, either create a new qdisc and graft
 * it into place, replace an existing one, or change one in place.
 * May replay itself (-EAGAIN from qdisc_create) after a module load.
 */
static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
			   struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm;
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

	if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

replay:
	/* Reinit, just in case something touches this. */
	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
				     rtm_tca_policy, extack);
	if (err < 0)
		return err;

	tcm = nlmsg_data(n);
	clid = tcm->tcm_parent;
	q = p = NULL;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;


	if (clid) {
		/* Find 'p' (the parent) and 'q' (the currently attached
		 * qdisc at the target position, if any).
		 */
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p) {
					NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
					return -ENOENT;
				}
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue_create(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}

		/* It may be default qdisc, ignore it */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				/* User named an explicit handle */
				if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
					NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
					return -EEXIST;
				}
				if (TC_H_MIN(tcm->tcm_handle)) {
					NL_SET_ERR_MSG(extack, "Invalid minor handle");
					return -EINVAL;
				}
				q = qdisc_lookup(dev, tcm->tcm_handle);
				if (!q)
					goto create_n_graft;
				if (n->nlmsg_flags & NLM_F_EXCL) {
					NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
					return -EEXIST;
				}
				if (tca[TCA_KIND] &&
				    nla_strcmp(tca[TCA_KIND], q->ops->id)) {
					NL_SET_ERR_MSG(extack, "Invalid qdisc name");
					return -EINVAL;
				}
				/* Refuse to graft a qdisc under itself or
				 * under one of its own descendants.
				 */
				if (q == p ||
				    (p && check_loop(q, p, 0))) {
					NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
					return -ELOOP;
				}
				qdisc_refcount_inc(q);
				goto graft;
			} else {
				if (!q)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 *   We know, that some child q is already
				 *   attached to this parent and have choice:
				 *   either to change it or to create/graft new one.
				 *
				 *   1. We are allowed to create/graft only
				 *   if CREATE and REPLACE flags are set.
				 *
				 *   2. If EXCL is set, requestor wanted to say,
				 *   that qdisc tcm_handle is not expected
				 *   to exist, so that we choose create/graft too.
				 *
				 *   3. The last case is when no flags are set.
				 *   Alas, it is sort of hole in API, we
				 *   cannot decide what to do unambiguously.
				 *   For now we select create/graft, if
				 *   user gave KIND, which does not match existing.
				 */
				if ((n->nlmsg_flags & NLM_F_CREATE) &&
				    (n->nlmsg_flags & NLM_F_REPLACE) &&
				    ((n->nlmsg_flags & NLM_F_EXCL) ||
				     (tca[TCA_KIND] &&
				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle) {
			NL_SET_ERR_MSG(extack, "Handle cannot be zero");
			return -EINVAL;
		}
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (!q) {
		NL_SET_ERR_MSG(extack, "Specified qdisc not found");
		return -ENOENT;
	}
	if (n->nlmsg_flags & NLM_F_EXCL) {
		NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
		return -EEXIST;
	}
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
		return -EINVAL;
	}
	err = qdisc_change(q, tca, extack);
	if (err == 0)
		qdisc_notify(net, skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags & NLM_F_CREATE)) {
		NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
		return -ENOENT;
	}
	if (clid == TC_H_INGRESS) {
		if (dev_ingress_queue(dev)) {
			q = qdisc_create(dev, dev_ingress_queue(dev), p,
					 tcm->tcm_parent, tcm->tcm_parent,
					 tca, &err, extack);
		} else {
			NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
			err = -ENOENT;
		}
	} else {
		struct netdev_queue *dev_queue;

		/* Let a classful parent pick the queue, else inherit it */
		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
			dev_queue = p->ops->cl_ops->select_queue(p, tcm);
		else if (p)
			dev_queue = p->dev_queue;
		else
			dev_queue = netdev_get_tx_queue(dev, 0);

		q = qdisc_create(dev, dev_queue, p,
				 tcm->tcm_parent, tcm->tcm_handle,
				 tca, &err, extack);
	}
	if (q == NULL) {
		/* -EAGAIN: a module was loaded, replay the request */
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
	if (err) {
		if (q)
			qdisc_put(q);
		return err;
	}

	return 0;
}
1672
/* Dump 'root' and (when 'recur') every qdisc in the device's hash table
 * into 'skb'.  *q_idx_p is the running dump cursor; entries below
 * s_q_idx were already emitted in a previous dump round and are skipped.
 * Returns 0 when done, -1 when skb filled up (cursor left at the first
 * undumped qdisc so the dump can resume).
 */
static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
			      struct netlink_callback *cb,
			      int *q_idx_p, int s_q_idx, bool recur,
			      bool dump_invisible)
{
	int ret = 0, q_idx = *q_idx_p;
	struct Qdisc *q;
	int b;

	if (!root)
		return 0;

	/* First emit the root itself */
	q = root;
	if (q_idx < s_q_idx) {
		q_idx++;
	} else {
		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
				  RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

	/* If dumping singletons, there is no qdisc_dev(root) and the singleton
	 * itself has already been dumped.
	 *
	 * If we've already dumped the top-level (ingress) qdisc above and the global
	 * qdisc hashtable, we don't want to hit it again
	 */
	if (!qdisc_dev(root) || !recur)
		goto out;

	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
		if (q_idx < s_q_idx) {
			q_idx++;
			continue;
		}
		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
				  RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

out:
	*q_idx_p = q_idx;
	return ret;
done:
	/* skb is full; signal the caller to stop and resume later */
	ret = -1;
	goto out;
}
1726
/* RTM_GETQDISC dump handler: iterate every device in the namespace and
 * dump its root (and hashed child) qdiscs, then its ingress qdisc.
 * Resume state lives in cb->args[0] (device index) and cb->args[1]
 * (per-device qdisc index).  Runs under RTNL.
 */
static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;
	const struct nlmsghdr *nlh = cb->nlh;
	struct nlattr *tca[TCA_MAX + 1];
	int err;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];

	idx = 0;
	ASSERT_RTNL();

	/* Parse the request attributes; TCA_DUMP_INVISIBLE opts in to
	 * dumping qdiscs that are normally hidden from dumps.
	 */
	err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
				     rtm_tca_policy, cb->extack);
	if (err < 0)
		return err;

	for_each_netdev(net, dev) {
		struct netdev_queue *dev_queue;

		if (idx < s_idx)
			goto cont;
		if (idx > s_idx)
			s_q_idx = 0;	/* new device: restart the qdisc index */
		q_idx = 0;

		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx,
				       true, tca[TCA_DUMP_INVISIBLE]) < 0)
			goto done;

		dev_queue = dev_ingress_queue(dev);
		if (dev_queue &&
		    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
				       &q_idx, s_q_idx, false,
				       tca[TCA_DUMP_INVISIBLE]) < 0)
			goto done;

cont:
		idx++;
	}

done:
	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}
1778
1779
1780
1781 /************************************************
1782  *      Traffic classes manipulation.           *
1783  ************************************************/
1784
/* Build one netlink message of type @event describing class @cl of
 * qdisc @q: the tcmsg header, TCA_KIND, the class ops' own dump and
 * its statistics.  Returns skb->len on success, -1 when the skb is
 * full (the partial message is trimmed off).
 */
static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 portid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	/* Called once per class during dumps; yield to avoid hogging CPU. */
	cond_resched();
	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	/* NOTE(review): both fields start as the qdisc handle; presumably
	 * cl_ops->dump() below fills in the class's real handle/parent --
	 * confirm against the individual qdisc implementations.
	 */
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 NULL, &d, TCA_PAD) < 0)
		goto nla_put_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}
1830
1831 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1832                          struct nlmsghdr *n, struct Qdisc *q,
1833                          unsigned long cl, int event)
1834 {
1835         struct sk_buff *skb;
1836         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1837         int err = 0;
1838
1839         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1840         if (!skb)
1841                 return -ENOBUFS;
1842
1843         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1844                 kfree_skb(skb);
1845                 return -EINVAL;
1846         }
1847
1848         err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1849                              n->nlmsg_flags & NLM_F_ECHO);
1850         if (err > 0)
1851                 err = 0;
1852         return err;
1853 }
1854
/* Delete class @cl via @cops and notify RTNLGRP_TC listeners on success.
 * The RTM_DELTCLASS message is filled in *before* ->delete() runs,
 * because the class can no longer be dumped once it has been destroyed.
 * Returns 0 on success or a negative errno.
 */
static int tclass_del_notify(struct net *net,
			     const struct Qdisc_class_ops *cops,
			     struct sk_buff *oskb, struct nlmsghdr *n,
			     struct Qdisc *q, unsigned long cl)
{
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
	struct sk_buff *skb;
	int err = 0;

	if (!cops->delete)
		return -EOPNOTSUPP;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
			   RTM_DELTCLASS) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	/* Delete only after the message was built successfully. */
	err = cops->delete(q, cl);
	if (err) {
		kfree_skb(skb);
		return err;
	}

	err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
			     n->nlmsg_flags & NLM_F_ECHO);
	if (err > 0)
		err = 0;	/* positive return is not an error */
	return err;
}
1889
1890 #ifdef CONFIG_NET_CLS
1891
/* Arguments threaded through tcf_node_bind() via the embedded tcf_walker. */
struct tcf_bind_args {
	struct tcf_walker w;
	unsigned long base;	/* cookie of the class being walked (set to 'cl') */
	unsigned long cl;	/* new class cookie to bind matching filters to */
	u32 classid;		/* class id the filters currently point at */
};
1898
1899 static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
1900 {
1901         struct tcf_bind_args *a = (void *)arg;
1902
1903         if (tp->ops->bind_class) {
1904                 struct Qdisc *q = tcf_block_q(tp->chain->block);
1905
1906                 sch_tree_lock(q);
1907                 tp->ops->bind_class(n, a->classid, a->cl, q, a->base);
1908                 sch_tree_unlock(q);
1909         }
1910         return 0;
1911 }
1912
/* Arguments for tc_bind_class_walker(), carried in the qdisc_walker. */
struct tc_bind_class_args {
	struct qdisc_walker w;
	unsigned long new_cl;	/* class cookie to re-bind filters to (0 = unbind) */
	u32 portid;		/* kept from the request; not read by the walker */
	u32 clid;		/* class id whose filters are being re-bound */
};
1919
/* qdisc_walker callback: for class @cl of @q, walk every filter chain
 * and every filter on the class's tcf_block, re-binding (via
 * tcf_node_bind) results that point at a->clid to the cookie a->new_cl.
 */
static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl,
				struct qdisc_walker *w)
{
	struct tc_bind_class_args *a = (struct tc_bind_class_args *)w;
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct tcf_block *block;
	struct tcf_chain *chain;

	block = cops->tcf_block(q, cl, NULL);
	if (!block)
		return 0;
	for (chain = tcf_get_next_chain(block, NULL);
	     chain;
	     chain = tcf_get_next_chain(block, chain)) {
		struct tcf_proto *tp;

		for (tp = tcf_get_next_proto(chain, NULL, true);
		     tp; tp = tcf_get_next_proto(chain, tp, true)) {
			struct tcf_bind_args arg = {};

			arg.w.fn = tcf_node_bind;
			arg.classid = a->clid;
			arg.base = cl;	/* walked class is the bind base */
			arg.cl = a->new_cl;
			tp->ops->walk(tp, &arg.w, true);
		}
	}

	return 0;
}
1950
1951 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1952                            unsigned long new_cl)
1953 {
1954         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1955         struct tc_bind_class_args args = {};
1956
1957         if (!cops->tcf_block)
1958                 return;
1959         args.portid = portid;
1960         args.clid = clid;
1961         args.new_cl = new_cl;
1962         args.w.fn = tc_bind_class_walker;
1963         q->ops->cl_ops->walk(q, &args.w);
1964 }
1965
1966 #else
1967
/* CONFIG_NET_CLS disabled: no classifiers exist, nothing to re-bind. */
static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
			   unsigned long new_cl)
{
}
1972
1973 #endif
1974
/* Single doit handler for RTM_NEWTCLASS, RTM_DELTCLASS and RTM_GETTCLASS:
 * resolve the owning qdisc and the class id encoded in the tcmsg, then
 * dispatch to the qdisc's class ops.  Runs under RTNL.
 */
static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
			 struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 portid;
	u32 clid;
	u32 qid;
	int err;

	/* Only RTM_GETTCLASS is permitted without CAP_NET_ADMIN. */
	if ((n->nlmsg_type != RTM_GETTCLASS) &&
	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
				     rtm_tca_policy, extack);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0         - parent is root class.
	   parent == X:Y         - parent is a node in hierarchy.
	   parent == 0:Y         - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0         - generate handle from kernel pool.
	   handle == 0:Y         - class is X:Y, where X:0 is qdisc.
	   handle == X:Y         - clear.
	   handle == X:0         - root class.
	 */

	/* Step 1. Determine qdisc handle X:0 */

	/* NOTE: despite its name, 'portid' holds tcm_parent (a class id),
	 * not a netlink port id.
	 */
	portid = tcm->tcm_parent;
	clid = tcm->tcm_handle;
	qid = TC_H_MAJ(clid);

	if (portid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(portid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev->qdisc->handle;

		/* Now qid is genuine qdisc handle consistent
		 * both with parent and child.
		 *
		 * TC_H_MAJ(portid) still may be unspecified, complete it now.
		 */
		if (portid)
			portid = TC_H_MAKE(qid, portid);
	} else {
		if (qid == 0)
			qid = dev->qdisc->handle;
	}

	/* OK. Locate qdisc */
	q = qdisc_lookup(dev, qid);
	if (!q)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (portid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->find(q, clid);

	if (cl == 0) {
		/* Class not found: only creation with NLM_F_CREATE proceeds. */
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS ||
		    !(n->nlmsg_flags & NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags & NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = tclass_del_notify(net, cops, skb, n, q, cl);
			/* Unbind the class's filters by re-binding them to 0 */
			tc_bind_tclass(q, portid, clid, 0);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
		NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
		return -EOPNOTSUPP;
	}

	/* Create or change: delegate to the qdisc's ->change(). */
	new_cl = cl;
	err = -EOPNOTSUPP;
	if (cops->change)
		err = cops->change(q, clid, portid, tca, &new_cl, extack);
	if (err == 0) {
		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
		/* We just create a new class, need to do reverse binding. */
		if (cl != new_cl)
			tc_bind_tclass(q, portid, clid, new_cl);
	}
out:
	return err;
}
2111
/* State for qdisc_class_dump(), carried through the qdisc_walker. */
struct qdisc_dump_args {
	struct qdisc_walker     w;
	struct sk_buff          *skb;	/* dump message under construction */
	struct netlink_callback *cb;	/* netlink dump context (seq, portid) */
};
2117
2118 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
2119                             struct qdisc_walker *arg)
2120 {
2121         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
2122
2123         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
2124                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
2125                               RTM_NEWTCLASS);
2126 }
2127
/* Dump all classes of one qdisc.  *t_p counts qdiscs visited so far and
 * @s_t is the resume point from cb->args[0].  Returns 0 to continue the
 * dump, -1 when the skb filled up mid-walk.
 */
static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
				struct tcmsg *tcm, struct netlink_callback *cb,
				int *t_p, int s_t)
{
	struct qdisc_dump_args arg;

	/* Skip invisible qdiscs, qdiscs already dumped (below the resume
	 * point), classless qdiscs, and qdiscs not matching a requested
	 * parent -- but still count them so resume offsets stay stable.
	 */
	if (tc_qdisc_dump_ignore(q, false) ||
	    *t_p < s_t || !q->ops->cl_ops ||
	    (tcm->tcm_parent &&
	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
		(*t_p)++;
		return 0;
	}
	/* First qdisc past the resume point: clear per-qdisc resume state. */
	if (*t_p > s_t)
		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
	arg.w.fn = qdisc_class_dump;
	arg.skb = skb;
	arg.cb = cb;
	arg.w.stop  = 0;
	arg.w.skip = cb->args[1];	/* resume within this qdisc's classes */
	arg.w.count = 0;
	q->ops->cl_ops->walk(q, &arg.w);
	cb->args[1] = arg.w.count;
	if (arg.w.stop)
		return -1;
	(*t_p)++;
	return 0;
}
2156
/* Dump classes of @root and, unless a specific parent was requested,
 * of every hashed qdisc on the same device.  Returns 0 on success or
 * -1 when the skb is full.
 */
static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
			       struct tcmsg *tcm, struct netlink_callback *cb,
			       int *t_p, int s_t)
{
	struct Qdisc *q;
	int b;

	if (!root)
		return 0;

	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
		return -1;

	/* Singleton qdiscs have no device; nothing further to walk. */
	if (!qdisc_dev(root))
		return 0;

	if (tcm->tcm_parent) {
		/* Specific parent requested: dump only that extra qdisc. */
		q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
		if (q && q != root &&
		    tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
		return 0;
	}
	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
	}

	return 0;
}
2187
/* RTM_GETTCLASS dump handler: dump classes of the target device's root
 * and ingress qdisc trees.  Resume state is kept in cb->args[0].
 */
static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct tcmsg *tcm = nlmsg_data(cb->nlh);
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct net_device *dev;
	int t, s_t;

	if (nlmsg_len(cb->nlh) < sizeof(*tcm))
		return 0;
	/* Takes a device reference; released via dev_put() below. */
	dev = dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return 0;

	s_t = cb->args[0];
	t = 0;

	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
		goto done;

	dev_queue = dev_ingress_queue(dev);
	if (dev_queue &&
	    tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
				&t, s_t) < 0)
		goto done;

done:
	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}
2220
2221 #ifdef CONFIG_PROC_FS
/* Seq handler for /proc/net/psched.  Emits four hex words read by
 * userspace (e.g. iproute2) to derive the scheduler clock parameters:
 * ns per usec, ns per psched tick, a fixed 1000000, and the hrtimer
 * resolution expressed in Hz.
 * NOTE(review): the 1000000 constant appears to be a legacy "clock
 * resolution" field kept for ABI compatibility -- confirm before
 * changing the output format.
 */
static int psched_show(struct seq_file *seq, void *v)
{
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
		   1000000,
		   (u32)NSEC_PER_SEC / hrtimer_resolution);

	return 0;
}
2231
2232 static int __net_init psched_net_init(struct net *net)
2233 {
2234         struct proc_dir_entry *e;
2235
2236         e = proc_create_single("psched", 0, net->proc_net, psched_show);
2237         if (e == NULL)
2238                 return -ENOMEM;
2239
2240         return 0;
2241 }
2242
/* Per-netns teardown: remove the /proc/net/psched entry. */
static void __net_exit psched_net_exit(struct net *net)
{
	remove_proc_entry("psched", net->proc_net);
}
2247 #else
/* CONFIG_PROC_FS disabled: /proc/net/psched is not provided. */
static int __net_init psched_net_init(struct net *net)
{
	return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
}
2256 #endif
2257
/* Per-netns lifecycle hooks for the /proc/net/psched entry. */
static struct pernet_operations psched_net_ops = {
	.init = psched_net_init,
	.exit = psched_net_exit,
};
2262
/* Subsystem init: register the per-netns /proc support, the built-in
 * qdiscs, and the rtnetlink handlers for qdisc and class operations.
 * NOTE(review): register_qdisc() return values are ignored here --
 * presumably registration of these built-ins cannot fail; confirm.
 */
static int __init pktsched_init(void)
{
	int err;

	err = register_pernet_subsys(&psched_net_ops);
	if (err) {
		pr_err("pktsched_init: "
		       "cannot initialize per netns operations\n");
		return err;
	}

	register_qdisc(&pfifo_fast_ops);
	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	register_qdisc(&pfifo_head_drop_qdisc_ops);
	register_qdisc(&mq_qdisc_ops);
	register_qdisc(&noqueue_qdisc_ops);

	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
		      0);
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
		      0);

	return 0;
}
2292
2293 subsys_initcall(pktsched_init);