]> asedeno.scripts.mit.edu Git - linux.git/blob - net/sched/sch_api.c
nvme: support traffic based keep-alive
[linux.git] / net / sched / sch_api.c
1 /*
2  * net/sched/sch_api.c  Packet scheduler API.
3  *
4  *              This program is free software; you can redistribute it and/or
5  *              modify it under the terms of the GNU General Public License
6  *              as published by the Free Software Foundation; either version
7  *              2 of the License, or (at your option) any later version.
8  *
9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *
11  * Fixes:
12  *
13  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16  */
17
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/string.h>
22 #include <linux/errno.h>
23 #include <linux/skbuff.h>
24 #include <linux/init.h>
25 #include <linux/proc_fs.h>
26 #include <linux/seq_file.h>
27 #include <linux/kmod.h>
28 #include <linux/list.h>
29 #include <linux/hrtimer.h>
30 #include <linux/slab.h>
31 #include <linux/hashtable.h>
32
33 #include <net/net_namespace.h>
34 #include <net/sock.h>
35 #include <net/netlink.h>
36 #include <net/pkt_sched.h>
37 #include <net/pkt_cls.h>
38
39 /*
40
41    Short review.
42    -------------
43
44    This file consists of two interrelated parts:
45
46    1. queueing disciplines manager frontend.
47    2. traffic classes manager frontend.
48
49    Generally, queueing discipline ("qdisc") is a black box,
50    which is able to enqueue packets and to dequeue them (when
51    device is ready to send something) in order and at times
52    determined by algorithm hidden in it.
53
54    qdisc's are divided into two categories:
55    - "queues", which have no internal structure visible from outside.
56    - "schedulers", which split all the packets to "traffic classes",
57      using "packet classifiers" (look at cls_api.c)
58
59    In turn, classes may have child qdiscs (as rule, queues)
60    attached to them etc. etc. etc.
61
62    The goal of the routines in this file is to translate
63    information supplied by user in the form of handles
64    to more intelligible for kernel form, to make some sanity
65    checks and part of work, which is common to all qdiscs
66    and to provide rtnetlink notifications.
67
68    All real intelligent work is done inside qdisc modules.
69
70
71
72    Every discipline has two major routines: enqueue and dequeue.
73
74    ---dequeue
75
76    dequeue usually returns a skb to send. It is allowed to return NULL,
77    but it does not mean that queue is empty, it just means that
78    discipline does not want to send anything this time.
79    Queue is really empty if q->q.qlen == 0.
80    For complicated disciplines with multiple queues q->q is not
81    real packet queue, but however q->q.qlen must be valid.
82
83    ---enqueue
84
85    enqueue returns 0 if the packet was enqueued successfully.
86    If a packet (this one or another one) was dropped, it returns
87    a non-zero error code.
88    NET_XMIT_DROP        - this packet dropped
89      Expected action: do not backoff, but wait until queue will clear.
90    NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
91      Expected action: backoff or ignore
92
93    Auxiliary routines:
94
95    ---peek
96
97    like dequeue but without removing a packet from the queue
98
99    ---reset
100
101    returns qdisc to initial state: purge all buffers, clear all
102    timers, counters (except for statistics) etc.
103
104    ---init
105
106    initializes newly created qdisc.
107
108    ---destroy
109
110    destroys resources allocated by init and during lifetime of qdisc.
111
112    ---change
113
114    changes qdisc parameters.
115  */
116
117 /* Protects list of registered TC modules. It is pure SMP lock. */
118 static DEFINE_RWLOCK(qdisc_mod_lock);
119
120
121 /************************************************
122  *      Queueing disciplines manipulation.      *
123  ************************************************/
124
125
126 /* The list of all installed queueing disciplines. */
127
128 static struct Qdisc_ops *qdisc_base;
129
130 /* Register/unregister queueing discipline */
131
132 int register_qdisc(struct Qdisc_ops *qops)
133 {
134         struct Qdisc_ops *q, **qp;
135         int rc = -EEXIST;
136
137         write_lock(&qdisc_mod_lock);
138         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
139                 if (!strcmp(qops->id, q->id))
140                         goto out;
141
142         if (qops->enqueue == NULL)
143                 qops->enqueue = noop_qdisc_ops.enqueue;
144         if (qops->peek == NULL) {
145                 if (qops->dequeue == NULL)
146                         qops->peek = noop_qdisc_ops.peek;
147                 else
148                         goto out_einval;
149         }
150         if (qops->dequeue == NULL)
151                 qops->dequeue = noop_qdisc_ops.dequeue;
152
153         if (qops->cl_ops) {
154                 const struct Qdisc_class_ops *cops = qops->cl_ops;
155
156                 if (!(cops->find && cops->walk && cops->leaf))
157                         goto out_einval;
158
159                 if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
160                         goto out_einval;
161         }
162
163         qops->next = NULL;
164         *qp = qops;
165         rc = 0;
166 out:
167         write_unlock(&qdisc_mod_lock);
168         return rc;
169
170 out_einval:
171         rc = -EINVAL;
172         goto out;
173 }
174 EXPORT_SYMBOL(register_qdisc);
175
176 int unregister_qdisc(struct Qdisc_ops *qops)
177 {
178         struct Qdisc_ops *q, **qp;
179         int err = -ENOENT;
180
181         write_lock(&qdisc_mod_lock);
182         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
183                 if (q == qops)
184                         break;
185         if (q) {
186                 *qp = q->next;
187                 q->next = NULL;
188                 err = 0;
189         }
190         write_unlock(&qdisc_mod_lock);
191         return err;
192 }
193 EXPORT_SYMBOL(unregister_qdisc);
194
195 /* Get default qdisc if not otherwise specified */
196 void qdisc_get_default(char *name, size_t len)
197 {
198         read_lock(&qdisc_mod_lock);
199         strlcpy(name, default_qdisc_ops->id, len);
200         read_unlock(&qdisc_mod_lock);
201 }
202
203 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
204 {
205         struct Qdisc_ops *q = NULL;
206
207         for (q = qdisc_base; q; q = q->next) {
208                 if (!strcmp(name, q->id)) {
209                         if (!try_module_get(q->owner))
210                                 q = NULL;
211                         break;
212                 }
213         }
214
215         return q;
216 }
217
218 /* Set new default qdisc to use */
219 int qdisc_set_default(const char *name)
220 {
221         const struct Qdisc_ops *ops;
222
223         if (!capable(CAP_NET_ADMIN))
224                 return -EPERM;
225
226         write_lock(&qdisc_mod_lock);
227         ops = qdisc_lookup_default(name);
228         if (!ops) {
229                 /* Not found, drop lock and try to load module */
230                 write_unlock(&qdisc_mod_lock);
231                 request_module("sch_%s", name);
232                 write_lock(&qdisc_mod_lock);
233
234                 ops = qdisc_lookup_default(name);
235         }
236
237         if (ops) {
238                 /* Set new default */
239                 module_put(default_qdisc_ops->owner);
240                 default_qdisc_ops = ops;
241         }
242         write_unlock(&qdisc_mod_lock);
243
244         return ops ? 0 : -ENOENT;
245 }
246
#ifdef CONFIG_NET_SCH_DEFAULT
/*
 * Late initcall: install the default qdisc named by the kernel
 * configuration (CONFIG_DEFAULT_NET_SCH).
 */
static int __init sch_default_qdisc(void)
{
	int err = qdisc_set_default(CONFIG_DEFAULT_NET_SCH);

	return err;
}
late_initcall(sch_default_qdisc);
#endif
255
256 /* We know handle. Find qdisc among all qdisc's attached to device
257  * (root qdisc, all its children, children of children etc.)
258  * Note: caller either uses rtnl or rcu_read_lock()
259  */
260
261 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
262 {
263         struct Qdisc *q;
264
265         if (!qdisc_dev(root))
266                 return (root->handle == handle ? root : NULL);
267
268         if (!(root->flags & TCQ_F_BUILTIN) &&
269             root->handle == handle)
270                 return root;
271
272         hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle) {
273                 if (q->handle == handle)
274                         return q;
275         }
276         return NULL;
277 }
278
279 void qdisc_hash_add(struct Qdisc *q, bool invisible)
280 {
281         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
282                 ASSERT_RTNL();
283                 hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
284                 if (invisible)
285                         q->flags |= TCQ_F_INVISIBLE;
286         }
287 }
288 EXPORT_SYMBOL(qdisc_hash_add);
289
290 void qdisc_hash_del(struct Qdisc *q)
291 {
292         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
293                 ASSERT_RTNL();
294                 hash_del_rcu(&q->hash);
295         }
296 }
297 EXPORT_SYMBOL(qdisc_hash_del);
298
299 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
300 {
301         struct Qdisc *q;
302
303         if (!handle)
304                 return NULL;
305         q = qdisc_match_from_root(dev->qdisc, handle);
306         if (q)
307                 goto out;
308
309         if (dev_ingress_queue(dev))
310                 q = qdisc_match_from_root(
311                         dev_ingress_queue(dev)->qdisc_sleeping,
312                         handle);
313 out:
314         return q;
315 }
316
317 struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
318 {
319         struct netdev_queue *nq;
320         struct Qdisc *q;
321
322         if (!handle)
323                 return NULL;
324         q = qdisc_match_from_root(dev->qdisc, handle);
325         if (q)
326                 goto out;
327
328         nq = dev_ingress_queue_rcu(dev);
329         if (nq)
330                 q = qdisc_match_from_root(nq->qdisc_sleeping, handle);
331 out:
332         return q;
333 }
334
335 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
336 {
337         unsigned long cl;
338         struct Qdisc *leaf;
339         const struct Qdisc_class_ops *cops = p->ops->cl_ops;
340
341         if (cops == NULL)
342                 return NULL;
343         cl = cops->find(p, classid);
344
345         if (cl == 0)
346                 return NULL;
347         leaf = cops->leaf(p, cl);
348         return leaf;
349 }
350
351 /* Find queueing discipline by name */
352
353 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
354 {
355         struct Qdisc_ops *q = NULL;
356
357         if (kind) {
358                 read_lock(&qdisc_mod_lock);
359                 for (q = qdisc_base; q; q = q->next) {
360                         if (nla_strcmp(kind, q->id) == 0) {
361                                 if (!try_module_get(q->owner))
362                                         q = NULL;
363                                 break;
364                         }
365                 }
366                 read_unlock(&qdisc_mod_lock);
367         }
368         return q;
369 }
370
371 /* The linklayer setting was not transferred from iproute2 in older
372  * versions, and the rate table lookup systems have been dropped in
373  * the kernel. To stay backward compatible with older iproute2 tc
374  * utils, we detect the linklayer setting by detecting if the rate
375  * table was modified.
376  *
377  * For linklayer ATM table entries, the rate table will be aligned to
378  * 48 bytes, thus some table entries will contain the same value.  The
379  * mpu (min packet unit) is also encoded into the old rate table, thus
380  * starting from the mpu, we find low and high table entries for
381  * mapping this cell.  If these entries contain the same value, then
382  * the rate tables have been modified for linklayer ATM.
383  *
384  * This is done by rounding mpu to the nearest 48 bytes cell/entry,
385  * and then roundup to the next cell, calc the table entry one below,
386  * and compare.
387  */
388 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
389 {
390         int low       = roundup(r->mpu, 48);
391         int high      = roundup(low+1, 48);
392         int cell_low  = low >> r->cell_log;
393         int cell_high = (high >> r->cell_log) - 1;
394
395         /* rtab is too inaccurate at rates > 100Mbit/s */
396         if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
397                 pr_debug("TC linklayer: Giving up ATM detection\n");
398                 return TC_LINKLAYER_ETHERNET;
399         }
400
401         if ((cell_high > cell_low) && (cell_high < 256)
402             && (rtab[cell_low] == rtab[cell_high])) {
403                 pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
404                          cell_low, cell_high, rtab[cell_high]);
405                 return TC_LINKLAYER_ATM;
406         }
407         return TC_LINKLAYER_ETHERNET;
408 }
409
410 static struct qdisc_rate_table *qdisc_rtab_list;
411
412 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
413                                         struct nlattr *tab,
414                                         struct netlink_ext_ack *extack)
415 {
416         struct qdisc_rate_table *rtab;
417
418         if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
419             nla_len(tab) != TC_RTAB_SIZE) {
420                 NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
421                 return NULL;
422         }
423
424         for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
425                 if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
426                     !memcmp(&rtab->data, nla_data(tab), 1024)) {
427                         rtab->refcnt++;
428                         return rtab;
429                 }
430         }
431
432         rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
433         if (rtab) {
434                 rtab->rate = *r;
435                 rtab->refcnt = 1;
436                 memcpy(rtab->data, nla_data(tab), 1024);
437                 if (r->linklayer == TC_LINKLAYER_UNAWARE)
438                         r->linklayer = __detect_linklayer(r, rtab->data);
439                 rtab->next = qdisc_rtab_list;
440                 qdisc_rtab_list = rtab;
441         } else {
442                 NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
443         }
444         return rtab;
445 }
446 EXPORT_SYMBOL(qdisc_get_rtab);
447
448 void qdisc_put_rtab(struct qdisc_rate_table *tab)
449 {
450         struct qdisc_rate_table *rtab, **rtabp;
451
452         if (!tab || --tab->refcnt)
453                 return;
454
455         for (rtabp = &qdisc_rtab_list;
456              (rtab = *rtabp) != NULL;
457              rtabp = &rtab->next) {
458                 if (rtab == tab) {
459                         *rtabp = rtab->next;
460                         kfree(rtab);
461                         return;
462                 }
463         }
464 }
465 EXPORT_SYMBOL(qdisc_put_rtab);
466
467 static LIST_HEAD(qdisc_stab_list);
468
469 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
470         [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
471         [TCA_STAB_DATA] = { .type = NLA_BINARY },
472 };
473
474 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
475                                                struct netlink_ext_ack *extack)
476 {
477         struct nlattr *tb[TCA_STAB_MAX + 1];
478         struct qdisc_size_table *stab;
479         struct tc_sizespec *s;
480         unsigned int tsize = 0;
481         u16 *tab = NULL;
482         int err;
483
484         err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy, extack);
485         if (err < 0)
486                 return ERR_PTR(err);
487         if (!tb[TCA_STAB_BASE]) {
488                 NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
489                 return ERR_PTR(-EINVAL);
490         }
491
492         s = nla_data(tb[TCA_STAB_BASE]);
493
494         if (s->tsize > 0) {
495                 if (!tb[TCA_STAB_DATA]) {
496                         NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
497                         return ERR_PTR(-EINVAL);
498                 }
499                 tab = nla_data(tb[TCA_STAB_DATA]);
500                 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
501         }
502
503         if (tsize != s->tsize || (!tab && tsize > 0)) {
504                 NL_SET_ERR_MSG(extack, "Invalid size of size table");
505                 return ERR_PTR(-EINVAL);
506         }
507
508         list_for_each_entry(stab, &qdisc_stab_list, list) {
509                 if (memcmp(&stab->szopts, s, sizeof(*s)))
510                         continue;
511                 if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
512                         continue;
513                 stab->refcnt++;
514                 return stab;
515         }
516
517         stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
518         if (!stab)
519                 return ERR_PTR(-ENOMEM);
520
521         stab->refcnt = 1;
522         stab->szopts = *s;
523         if (tsize > 0)
524                 memcpy(stab->data, tab, tsize * sizeof(u16));
525
526         list_add_tail(&stab->list, &qdisc_stab_list);
527
528         return stab;
529 }
530
531 static void stab_kfree_rcu(struct rcu_head *head)
532 {
533         kfree(container_of(head, struct qdisc_size_table, rcu));
534 }
535
536 void qdisc_put_stab(struct qdisc_size_table *tab)
537 {
538         if (!tab)
539                 return;
540
541         if (--tab->refcnt == 0) {
542                 list_del(&tab->list);
543                 call_rcu_bh(&tab->rcu, stab_kfree_rcu);
544         }
545 }
546 EXPORT_SYMBOL(qdisc_put_stab);
547
548 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
549 {
550         struct nlattr *nest;
551
552         nest = nla_nest_start(skb, TCA_STAB);
553         if (nest == NULL)
554                 goto nla_put_failure;
555         if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
556                 goto nla_put_failure;
557         nla_nest_end(skb, nest);
558
559         return skb->len;
560
561 nla_put_failure:
562         return -1;
563 }
564
565 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
566                                const struct qdisc_size_table *stab)
567 {
568         int pkt_len, slot;
569
570         pkt_len = skb->len + stab->szopts.overhead;
571         if (unlikely(!stab->szopts.tsize))
572                 goto out;
573
574         slot = pkt_len + stab->szopts.cell_align;
575         if (unlikely(slot < 0))
576                 slot = 0;
577
578         slot >>= stab->szopts.cell_log;
579         if (likely(slot < stab->szopts.tsize))
580                 pkt_len = stab->data[slot];
581         else
582                 pkt_len = stab->data[stab->szopts.tsize - 1] *
583                                 (slot / stab->szopts.tsize) +
584                                 stab->data[slot % stab->szopts.tsize];
585
586         pkt_len <<= stab->szopts.size_log;
587 out:
588         if (unlikely(pkt_len < 1))
589                 pkt_len = 1;
590         qdisc_skb_cb(skb)->pkt_len = pkt_len;
591 }
592 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
593
594 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
595 {
596         if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
597                 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
598                         txt, qdisc->ops->id, qdisc->handle >> 16);
599                 qdisc->flags |= TCQ_F_WARN_NONWC;
600         }
601 }
602 EXPORT_SYMBOL(qdisc_warn_nonwc);
603
604 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
605 {
606         struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
607                                                  timer);
608
609         rcu_read_lock();
610         __netif_schedule(qdisc_root(wd->qdisc));
611         rcu_read_unlock();
612
613         return HRTIMER_NORESTART;
614 }
615
616 void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
617                                  clockid_t clockid)
618 {
619         hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
620         wd->timer.function = qdisc_watchdog;
621         wd->qdisc = qdisc;
622 }
623 EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
624
625 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
626 {
627         qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
628 }
629 EXPORT_SYMBOL(qdisc_watchdog_init);
630
631 void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
632 {
633         if (test_bit(__QDISC_STATE_DEACTIVATED,
634                      &qdisc_root_sleeping(wd->qdisc)->state))
635                 return;
636
637         if (wd->last_expires == expires)
638                 return;
639
640         wd->last_expires = expires;
641         hrtimer_start(&wd->timer,
642                       ns_to_ktime(expires),
643                       HRTIMER_MODE_ABS_PINNED);
644 }
645 EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
646
647 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
648 {
649         hrtimer_cancel(&wd->timer);
650 }
651 EXPORT_SYMBOL(qdisc_watchdog_cancel);
652
653 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
654 {
655         struct hlist_head *h;
656         unsigned int i;
657
658         h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
659
660         if (h != NULL) {
661                 for (i = 0; i < n; i++)
662                         INIT_HLIST_HEAD(&h[i]);
663         }
664         return h;
665 }
666
667 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
668 {
669         struct Qdisc_class_common *cl;
670         struct hlist_node *next;
671         struct hlist_head *nhash, *ohash;
672         unsigned int nsize, nmask, osize;
673         unsigned int i, h;
674
675         /* Rehash when load factor exceeds 0.75 */
676         if (clhash->hashelems * 4 <= clhash->hashsize * 3)
677                 return;
678         nsize = clhash->hashsize * 2;
679         nmask = nsize - 1;
680         nhash = qdisc_class_hash_alloc(nsize);
681         if (nhash == NULL)
682                 return;
683
684         ohash = clhash->hash;
685         osize = clhash->hashsize;
686
687         sch_tree_lock(sch);
688         for (i = 0; i < osize; i++) {
689                 hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
690                         h = qdisc_class_hash(cl->classid, nmask);
691                         hlist_add_head(&cl->hnode, &nhash[h]);
692                 }
693         }
694         clhash->hash     = nhash;
695         clhash->hashsize = nsize;
696         clhash->hashmask = nmask;
697         sch_tree_unlock(sch);
698
699         kvfree(ohash);
700 }
701 EXPORT_SYMBOL(qdisc_class_hash_grow);
702
703 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
704 {
705         unsigned int size = 4;
706
707         clhash->hash = qdisc_class_hash_alloc(size);
708         if (!clhash->hash)
709                 return -ENOMEM;
710         clhash->hashsize  = size;
711         clhash->hashmask  = size - 1;
712         clhash->hashelems = 0;
713         return 0;
714 }
715 EXPORT_SYMBOL(qdisc_class_hash_init);
716
717 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
718 {
719         kvfree(clhash->hash);
720 }
721 EXPORT_SYMBOL(qdisc_class_hash_destroy);
722
723 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
724                              struct Qdisc_class_common *cl)
725 {
726         unsigned int h;
727
728         INIT_HLIST_NODE(&cl->hnode);
729         h = qdisc_class_hash(cl->classid, clhash->hashmask);
730         hlist_add_head(&cl->hnode, &clhash->hash[h]);
731         clhash->hashelems++;
732 }
733 EXPORT_SYMBOL(qdisc_class_hash_insert);
734
735 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
736                              struct Qdisc_class_common *cl)
737 {
738         hlist_del(&cl->hnode);
739         clhash->hashelems--;
740 }
741 EXPORT_SYMBOL(qdisc_class_hash_remove);
742
743 /* Allocate an unique handle from space managed by kernel
744  * Possible range is [8000-FFFF]:0000 (0x8000 values)
745  */
746 static u32 qdisc_alloc_handle(struct net_device *dev)
747 {
748         int i = 0x8000;
749         static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
750
751         do {
752                 autohandle += TC_H_MAKE(0x10000U, 0);
753                 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
754                         autohandle = TC_H_MAKE(0x80000000U, 0);
755                 if (!qdisc_lookup(dev, autohandle))
756                         return autohandle;
757                 cond_resched();
758         } while (--i > 0);
759
760         return 0;
761 }
762
763 void qdisc_tree_reduce_backlog(struct Qdisc *sch, unsigned int n,
764                                unsigned int len)
765 {
766         bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
767         const struct Qdisc_class_ops *cops;
768         unsigned long cl;
769         u32 parentid;
770         bool notify;
771         int drops;
772
773         if (n == 0 && len == 0)
774                 return;
775         drops = max_t(int, n, 0);
776         rcu_read_lock();
777         while ((parentid = sch->parent)) {
778                 if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
779                         break;
780
781                 if (sch->flags & TCQ_F_NOPARENT)
782                         break;
783                 /* Notify parent qdisc only if child qdisc becomes empty.
784                  *
785                  * If child was empty even before update then backlog
786                  * counter is screwed and we skip notification because
787                  * parent class is already passive.
788                  *
789                  * If the original child was offloaded then it is allowed
790                  * to be seem as empty, so the parent is notified anyway.
791                  */
792                 notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
793                                                        !qdisc_is_offloaded);
794                 /* TODO: perform the search on a per txq basis */
795                 sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
796                 if (sch == NULL) {
797                         WARN_ON_ONCE(parentid != TC_H_ROOT);
798                         break;
799                 }
800                 cops = sch->ops->cl_ops;
801                 if (notify && cops->qlen_notify) {
802                         cl = cops->find(sch, parentid);
803                         cops->qlen_notify(sch, cl);
804                 }
805                 sch->q.qlen -= n;
806                 sch->qstats.backlog -= len;
807                 __qdisc_qstats_drop(sch, drops);
808         }
809         rcu_read_unlock();
810 }
811 EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
812
813 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
814                          u32 portid, u32 seq, u16 flags, int event)
815 {
816         struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
817         struct gnet_stats_queue __percpu *cpu_qstats = NULL;
818         struct tcmsg *tcm;
819         struct nlmsghdr  *nlh;
820         unsigned char *b = skb_tail_pointer(skb);
821         struct gnet_dump d;
822         struct qdisc_size_table *stab;
823         u32 block_index;
824         __u32 qlen;
825
826         cond_resched();
827         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
828         if (!nlh)
829                 goto out_nlmsg_trim;
830         tcm = nlmsg_data(nlh);
831         tcm->tcm_family = AF_UNSPEC;
832         tcm->tcm__pad1 = 0;
833         tcm->tcm__pad2 = 0;
834         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
835         tcm->tcm_parent = clid;
836         tcm->tcm_handle = q->handle;
837         tcm->tcm_info = refcount_read(&q->refcnt);
838         if (nla_put_string(skb, TCA_KIND, q->ops->id))
839                 goto nla_put_failure;
840         if (q->ops->ingress_block_get) {
841                 block_index = q->ops->ingress_block_get(q);
842                 if (block_index &&
843                     nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
844                         goto nla_put_failure;
845         }
846         if (q->ops->egress_block_get) {
847                 block_index = q->ops->egress_block_get(q);
848                 if (block_index &&
849                     nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
850                         goto nla_put_failure;
851         }
852         if (q->ops->dump && q->ops->dump(q, skb) < 0)
853                 goto nla_put_failure;
854         if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
855                 goto nla_put_failure;
856         qlen = qdisc_qlen_sum(q);
857
858         stab = rtnl_dereference(q->stab);
859         if (stab && qdisc_dump_stab(skb, stab) < 0)
860                 goto nla_put_failure;
861
862         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
863                                          NULL, &d, TCA_PAD) < 0)
864                 goto nla_put_failure;
865
866         if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
867                 goto nla_put_failure;
868
869         if (qdisc_is_percpu_stats(q)) {
870                 cpu_bstats = q->cpu_bstats;
871                 cpu_qstats = q->cpu_qstats;
872         }
873
874         if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
875                                   &d, cpu_bstats, &q->bstats) < 0 ||
876             gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
877             gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
878                 goto nla_put_failure;
879
880         if (gnet_stats_finish_copy(&d) < 0)
881                 goto nla_put_failure;
882
883         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
884         return skb->len;
885
886 out_nlmsg_trim:
887 nla_put_failure:
888         nlmsg_trim(skb, b);
889         return -1;
890 }
891
892 static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
893 {
894         if (q->flags & TCQ_F_BUILTIN)
895                 return true;
896         if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
897                 return true;
898
899         return false;
900 }
901
902 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
903                         struct nlmsghdr *n, u32 clid,
904                         struct Qdisc *old, struct Qdisc *new)
905 {
906         struct sk_buff *skb;
907         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
908
909         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
910         if (!skb)
911                 return -ENOBUFS;
912
913         if (old && !tc_qdisc_dump_ignore(old, false)) {
914                 if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
915                                   0, RTM_DELQDISC) < 0)
916                         goto err_out;
917         }
918         if (new && !tc_qdisc_dump_ignore(new, false)) {
919                 if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
920                                   old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
921                         goto err_out;
922         }
923
924         if (skb->len)
925                 return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
926                                       n->nlmsg_flags & NLM_F_ECHO);
927
928 err_out:
929         kfree_skb(skb);
930         return -EINVAL;
931 }
932
933 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
934                                struct nlmsghdr *n, u32 clid,
935                                struct Qdisc *old, struct Qdisc *new)
936 {
937         if (new || old)
938                 qdisc_notify(net, skb, n, clid, old, new);
939
940         if (old)
941                 qdisc_put(old);
942 }
943
944 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
945  * to device "dev".
946  *
947  * When appropriate send a netlink notification using 'skb'
948  * and "n".
949  *
950  * On success, destroy old qdisc.
951  */
952
953 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
954                        struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
955                        struct Qdisc *new, struct Qdisc *old,
956                        struct netlink_ext_ack *extack)
957 {
958         struct Qdisc *q = old;
959         struct net *net = dev_net(dev);
960         int err = 0;
961
962         if (parent == NULL) {
963                 unsigned int i, num_q, ingress;
964
965                 ingress = 0;
966                 num_q = dev->num_tx_queues;
967                 if ((q && q->flags & TCQ_F_INGRESS) ||
968                     (new && new->flags & TCQ_F_INGRESS)) {
969                         num_q = 1;
970                         ingress = 1;
971                         if (!dev_ingress_queue(dev)) {
972                                 NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
973                                 return -ENOENT;
974                         }
975                 }
976
977                 if (dev->flags & IFF_UP)
978                         dev_deactivate(dev);
979
980                 if (new && new->ops->attach)
981                         goto skip;
982
983                 for (i = 0; i < num_q; i++) {
984                         struct netdev_queue *dev_queue = dev_ingress_queue(dev);
985
986                         if (!ingress)
987                                 dev_queue = netdev_get_tx_queue(dev, i);
988
989                         old = dev_graft_qdisc(dev_queue, new);
990                         if (new && i > 0)
991                                 qdisc_refcount_inc(new);
992
993                         if (!ingress)
994                                 qdisc_put(old);
995                 }
996
997 skip:
998                 if (!ingress) {
999                         notify_and_destroy(net, skb, n, classid,
1000                                            dev->qdisc, new);
1001                         if (new && !new->ops->attach)
1002                                 qdisc_refcount_inc(new);
1003                         dev->qdisc = new ? : &noop_qdisc;
1004
1005                         if (new && new->ops->attach)
1006                                 new->ops->attach(new);
1007                 } else {
1008                         notify_and_destroy(net, skb, n, classid, old, new);
1009                 }
1010
1011                 if (dev->flags & IFF_UP)
1012                         dev_activate(dev);
1013         } else {
1014                 const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
1015
1016                 /* Only support running class lockless if parent is lockless */
1017                 if (new && (new->flags & TCQ_F_NOLOCK) &&
1018                     parent && !(parent->flags & TCQ_F_NOLOCK))
1019                         new->flags &= ~TCQ_F_NOLOCK;
1020
1021                 err = -EOPNOTSUPP;
1022                 if (cops && cops->graft) {
1023                         unsigned long cl = cops->find(parent, classid);
1024
1025                         if (cl) {
1026                                 err = cops->graft(parent, cl, new, &old,
1027                                                   extack);
1028                         } else {
1029                                 NL_SET_ERR_MSG(extack, "Specified class not found");
1030                                 err = -ENOENT;
1031                         }
1032                 }
1033                 if (!err)
1034                         notify_and_destroy(net, skb, n, classid, old, new);
1035         }
1036         return err;
1037 }
1038
1039 static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
1040                                    struct netlink_ext_ack *extack)
1041 {
1042         u32 block_index;
1043
1044         if (tca[TCA_INGRESS_BLOCK]) {
1045                 block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
1046
1047                 if (!block_index) {
1048                         NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
1049                         return -EINVAL;
1050                 }
1051                 if (!sch->ops->ingress_block_set) {
1052                         NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
1053                         return -EOPNOTSUPP;
1054                 }
1055                 sch->ops->ingress_block_set(sch, block_index);
1056         }
1057         if (tca[TCA_EGRESS_BLOCK]) {
1058                 block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
1059
1060                 if (!block_index) {
1061                         NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
1062                         return -EINVAL;
1063                 }
1064                 if (!sch->ops->egress_block_set) {
1065                         NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
1066                         return -EOPNOTSUPP;
1067                 }
1068                 sch->ops->egress_block_set(sch, block_index);
1069         }
1070         return 0;
1071 }
1072
1073 /*
1074    Allocate and initialize new qdisc.
1075
1076    Parameters are passed via opt.
1077  */
1078
1079 static struct Qdisc *qdisc_create(struct net_device *dev,
1080                                   struct netdev_queue *dev_queue,
1081                                   struct Qdisc *p, u32 parent, u32 handle,
1082                                   struct nlattr **tca, int *errp,
1083                                   struct netlink_ext_ack *extack)
1084 {
1085         int err;
1086         struct nlattr *kind = tca[TCA_KIND];
1087         struct Qdisc *sch;
1088         struct Qdisc_ops *ops;
1089         struct qdisc_size_table *stab;
1090
1091         ops = qdisc_lookup_ops(kind);
1092 #ifdef CONFIG_MODULES
1093         if (ops == NULL && kind != NULL) {
1094                 char name[IFNAMSIZ];
1095                 if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
1096                         /* We dropped the RTNL semaphore in order to
1097                          * perform the module load.  So, even if we
1098                          * succeeded in loading the module we have to
1099                          * tell the caller to replay the request.  We
1100                          * indicate this using -EAGAIN.
1101                          * We replay the request because the device may
1102                          * go away in the mean time.
1103                          */
1104                         rtnl_unlock();
1105                         request_module("sch_%s", name);
1106                         rtnl_lock();
1107                         ops = qdisc_lookup_ops(kind);
1108                         if (ops != NULL) {
1109                                 /* We will try again qdisc_lookup_ops,
1110                                  * so don't keep a reference.
1111                                  */
1112                                 module_put(ops->owner);
1113                                 err = -EAGAIN;
1114                                 goto err_out;
1115                         }
1116                 }
1117         }
1118 #endif
1119
1120         err = -ENOENT;
1121         if (!ops) {
1122                 NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1123                 goto err_out;
1124         }
1125
1126         sch = qdisc_alloc(dev_queue, ops, extack);
1127         if (IS_ERR(sch)) {
1128                 err = PTR_ERR(sch);
1129                 goto err_out2;
1130         }
1131
1132         sch->parent = parent;
1133
1134         if (handle == TC_H_INGRESS) {
1135                 sch->flags |= TCQ_F_INGRESS;
1136                 handle = TC_H_MAKE(TC_H_INGRESS, 0);
1137         } else {
1138                 if (handle == 0) {
1139                         handle = qdisc_alloc_handle(dev);
1140                         err = -ENOMEM;
1141                         if (handle == 0)
1142                                 goto err_out3;
1143                 }
1144                 if (!netif_is_multiqueue(dev))
1145                         sch->flags |= TCQ_F_ONETXQUEUE;
1146         }
1147
1148         sch->handle = handle;
1149
1150         /* This exist to keep backward compatible with a userspace
1151          * loophole, what allowed userspace to get IFF_NO_QUEUE
1152          * facility on older kernels by setting tx_queue_len=0 (prior
1153          * to qdisc init), and then forgot to reinit tx_queue_len
1154          * before again attaching a qdisc.
1155          */
1156         if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
1157                 dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
1158                 netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
1159         }
1160
1161         err = qdisc_block_indexes_set(sch, tca, extack);
1162         if (err)
1163                 goto err_out3;
1164
1165         if (ops->init) {
1166                 err = ops->init(sch, tca[TCA_OPTIONS], extack);
1167                 if (err != 0)
1168                         goto err_out5;
1169         }
1170
1171         if (tca[TCA_STAB]) {
1172                 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1173                 if (IS_ERR(stab)) {
1174                         err = PTR_ERR(stab);
1175                         goto err_out4;
1176                 }
1177                 rcu_assign_pointer(sch->stab, stab);
1178         }
1179         if (tca[TCA_RATE]) {
1180                 seqcount_t *running;
1181
1182                 err = -EOPNOTSUPP;
1183                 if (sch->flags & TCQ_F_MQROOT) {
1184                         NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
1185                         goto err_out4;
1186                 }
1187
1188                 if (sch->parent != TC_H_ROOT &&
1189                     !(sch->flags & TCQ_F_INGRESS) &&
1190                     (!p || !(p->flags & TCQ_F_MQROOT)))
1191                         running = qdisc_root_sleeping_running(sch);
1192                 else
1193                         running = &sch->running;
1194
1195                 err = gen_new_estimator(&sch->bstats,
1196                                         sch->cpu_bstats,
1197                                         &sch->rate_est,
1198                                         NULL,
1199                                         running,
1200                                         tca[TCA_RATE]);
1201                 if (err) {
1202                         NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
1203                         goto err_out4;
1204                 }
1205         }
1206
1207         qdisc_hash_add(sch, false);
1208
1209         return sch;
1210
1211 err_out5:
1212         /* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
1213         if (ops->destroy)
1214                 ops->destroy(sch);
1215 err_out3:
1216         dev_put(dev);
1217         qdisc_free(sch);
1218 err_out2:
1219         module_put(ops->owner);
1220 err_out:
1221         *errp = err;
1222         return NULL;
1223
1224 err_out4:
1225         /*
1226          * Any broken qdiscs that would require a ops->reset() here?
1227          * The qdisc was never in action so it shouldn't be necessary.
1228          */
1229         qdisc_put_stab(rtnl_dereference(sch->stab));
1230         if (ops->destroy)
1231                 ops->destroy(sch);
1232         goto err_out3;
1233 }
1234
1235 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
1236                         struct netlink_ext_ack *extack)
1237 {
1238         struct qdisc_size_table *ostab, *stab = NULL;
1239         int err = 0;
1240
1241         if (tca[TCA_OPTIONS]) {
1242                 if (!sch->ops->change) {
1243                         NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
1244                         return -EINVAL;
1245                 }
1246                 if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
1247                         NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
1248                         return -EOPNOTSUPP;
1249                 }
1250                 err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
1251                 if (err)
1252                         return err;
1253         }
1254
1255         if (tca[TCA_STAB]) {
1256                 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1257                 if (IS_ERR(stab))
1258                         return PTR_ERR(stab);
1259         }
1260
1261         ostab = rtnl_dereference(sch->stab);
1262         rcu_assign_pointer(sch->stab, stab);
1263         qdisc_put_stab(ostab);
1264
1265         if (tca[TCA_RATE]) {
1266                 /* NB: ignores errors from replace_estimator
1267                    because change can't be undone. */
1268                 if (sch->flags & TCQ_F_MQROOT)
1269                         goto out;
1270                 gen_replace_estimator(&sch->bstats,
1271                                       sch->cpu_bstats,
1272                                       &sch->rate_est,
1273                                       NULL,
1274                                       qdisc_root_sleeping_running(sch),
1275                                       tca[TCA_RATE]);
1276         }
1277 out:
1278         return 0;
1279 }
1280
1281 struct check_loop_arg {
1282         struct qdisc_walker     w;
1283         struct Qdisc            *p;
1284         int                     depth;
1285 };
1286
1287 static int check_loop_fn(struct Qdisc *q, unsigned long cl,
1288                          struct qdisc_walker *w);
1289
1290 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1291 {
1292         struct check_loop_arg   arg;
1293
1294         if (q->ops->cl_ops == NULL)
1295                 return 0;
1296
1297         arg.w.stop = arg.w.skip = arg.w.count = 0;
1298         arg.w.fn = check_loop_fn;
1299         arg.depth = depth;
1300         arg.p = p;
1301         q->ops->cl_ops->walk(q, &arg.w);
1302         return arg.w.stop ? -ELOOP : 0;
1303 }
1304
1305 static int
1306 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1307 {
1308         struct Qdisc *leaf;
1309         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1310         struct check_loop_arg *arg = (struct check_loop_arg *)w;
1311
1312         leaf = cops->leaf(q, cl);
1313         if (leaf) {
1314                 if (leaf == arg->p || arg->depth > 7)
1315                         return -ELOOP;
1316                 return check_loop(leaf, arg->p, arg->depth + 1);
1317         }
1318         return 0;
1319 }
1320
1321 const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
1322         [TCA_KIND]              = { .type = NLA_STRING },
1323         [TCA_RATE]              = { .type = NLA_BINARY,
1324                                     .len = sizeof(struct tc_estimator) },
1325         [TCA_STAB]              = { .type = NLA_NESTED },
1326         [TCA_DUMP_INVISIBLE]    = { .type = NLA_FLAG },
1327         [TCA_CHAIN]             = { .type = NLA_U32 },
1328         [TCA_INGRESS_BLOCK]     = { .type = NLA_U32 },
1329         [TCA_EGRESS_BLOCK]      = { .type = NLA_U32 },
1330 };
1331
1332 /*
1333  * Delete/get qdisc.
1334  */
1335
1336 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1337                         struct netlink_ext_ack *extack)
1338 {
1339         struct net *net = sock_net(skb->sk);
1340         struct tcmsg *tcm = nlmsg_data(n);
1341         struct nlattr *tca[TCA_MAX + 1];
1342         struct net_device *dev;
1343         u32 clid;
1344         struct Qdisc *q = NULL;
1345         struct Qdisc *p = NULL;
1346         int err;
1347
1348         if ((n->nlmsg_type != RTM_GETQDISC) &&
1349             !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1350                 return -EPERM;
1351
1352         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
1353                           extack);
1354         if (err < 0)
1355                 return err;
1356
1357         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1358         if (!dev)
1359                 return -ENODEV;
1360
1361         clid = tcm->tcm_parent;
1362         if (clid) {
1363                 if (clid != TC_H_ROOT) {
1364                         if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1365                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1366                                 if (!p) {
1367                                         NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
1368                                         return -ENOENT;
1369                                 }
1370                                 q = qdisc_leaf(p, clid);
1371                         } else if (dev_ingress_queue(dev)) {
1372                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1373                         }
1374                 } else {
1375                         q = dev->qdisc;
1376                 }
1377                 if (!q) {
1378                         NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
1379                         return -ENOENT;
1380                 }
1381
1382                 if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
1383                         NL_SET_ERR_MSG(extack, "Invalid handle");
1384                         return -EINVAL;
1385                 }
1386         } else {
1387                 q = qdisc_lookup(dev, tcm->tcm_handle);
1388                 if (!q) {
1389                         NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
1390                         return -ENOENT;
1391                 }
1392         }
1393
1394         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1395                 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1396                 return -EINVAL;
1397         }
1398
1399         if (n->nlmsg_type == RTM_DELQDISC) {
1400                 if (!clid) {
1401                         NL_SET_ERR_MSG(extack, "Classid cannot be zero");
1402                         return -EINVAL;
1403                 }
1404                 if (q->handle == 0) {
1405                         NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
1406                         return -ENOENT;
1407                 }
1408                 err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
1409                 if (err != 0)
1410                         return err;
1411         } else {
1412                 qdisc_notify(net, skb, n, clid, NULL, q);
1413         }
1414         return 0;
1415 }
1416
1417 /*
1418  * Create/change qdisc.
1419  */
1420
1421 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1422                            struct netlink_ext_ack *extack)
1423 {
1424         struct net *net = sock_net(skb->sk);
1425         struct tcmsg *tcm;
1426         struct nlattr *tca[TCA_MAX + 1];
1427         struct net_device *dev;
1428         u32 clid;
1429         struct Qdisc *q, *p;
1430         int err;
1431
1432         if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1433                 return -EPERM;
1434
1435 replay:
1436         /* Reinit, just in case something touches this. */
1437         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
1438                           extack);
1439         if (err < 0)
1440                 return err;
1441
1442         tcm = nlmsg_data(n);
1443         clid = tcm->tcm_parent;
1444         q = p = NULL;
1445
1446         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1447         if (!dev)
1448                 return -ENODEV;
1449
1450
1451         if (clid) {
1452                 if (clid != TC_H_ROOT) {
1453                         if (clid != TC_H_INGRESS) {
1454                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1455                                 if (!p) {
1456                                         NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
1457                                         return -ENOENT;
1458                                 }
1459                                 q = qdisc_leaf(p, clid);
1460                         } else if (dev_ingress_queue_create(dev)) {
1461                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1462                         }
1463                 } else {
1464                         q = dev->qdisc;
1465                 }
1466
1467                 /* It may be default qdisc, ignore it */
1468                 if (q && q->handle == 0)
1469                         q = NULL;
1470
1471                 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1472                         if (tcm->tcm_handle) {
1473                                 if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
1474                                         NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
1475                                         return -EEXIST;
1476                                 }
1477                                 if (TC_H_MIN(tcm->tcm_handle)) {
1478                                         NL_SET_ERR_MSG(extack, "Invalid minor handle");
1479                                         return -EINVAL;
1480                                 }
1481                                 q = qdisc_lookup(dev, tcm->tcm_handle);
1482                                 if (!q)
1483                                         goto create_n_graft;
1484                                 if (n->nlmsg_flags & NLM_F_EXCL) {
1485                                         NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
1486                                         return -EEXIST;
1487                                 }
1488                                 if (tca[TCA_KIND] &&
1489                                     nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1490                                         NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1491                                         return -EINVAL;
1492                                 }
1493                                 if (q == p ||
1494                                     (p && check_loop(q, p, 0))) {
1495                                         NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
1496                                         return -ELOOP;
1497                                 }
1498                                 qdisc_refcount_inc(q);
1499                                 goto graft;
1500                         } else {
1501                                 if (!q)
1502                                         goto create_n_graft;
1503
1504                                 /* This magic test requires explanation.
1505                                  *
1506                                  *   We know, that some child q is already
1507                                  *   attached to this parent and have choice:
1508                                  *   either to change it or to create/graft new one.
1509                                  *
1510                                  *   1. We are allowed to create/graft only
1511                                  *   if CREATE and REPLACE flags are set.
1512                                  *
1513                                  *   2. If EXCL is set, requestor wanted to say,
1514                                  *   that qdisc tcm_handle is not expected
1515                                  *   to exist, so that we choose create/graft too.
1516                                  *
1517                                  *   3. The last case is when no flags are set.
1518                                  *   Alas, it is sort of hole in API, we
1519                                  *   cannot decide what to do unambiguously.
1520                                  *   For now we select create/graft, if
1521                                  *   user gave KIND, which does not match existing.
1522                                  */
1523                                 if ((n->nlmsg_flags & NLM_F_CREATE) &&
1524                                     (n->nlmsg_flags & NLM_F_REPLACE) &&
1525                                     ((n->nlmsg_flags & NLM_F_EXCL) ||
1526                                      (tca[TCA_KIND] &&
1527                                       nla_strcmp(tca[TCA_KIND], q->ops->id))))
1528                                         goto create_n_graft;
1529                         }
1530                 }
1531         } else {
1532                 if (!tcm->tcm_handle) {
1533                         NL_SET_ERR_MSG(extack, "Handle cannot be zero");
1534                         return -EINVAL;
1535                 }
1536                 q = qdisc_lookup(dev, tcm->tcm_handle);
1537         }
1538
1539         /* Change qdisc parameters */
1540         if (!q) {
1541                 NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1542                 return -ENOENT;
1543         }
1544         if (n->nlmsg_flags & NLM_F_EXCL) {
1545                 NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
1546                 return -EEXIST;
1547         }
1548         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1549                 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1550                 return -EINVAL;
1551         }
1552         err = qdisc_change(q, tca, extack);
1553         if (err == 0)
1554                 qdisc_notify(net, skb, n, clid, NULL, q);
1555         return err;
1556
1557 create_n_graft:
1558         if (!(n->nlmsg_flags & NLM_F_CREATE)) {
1559                 NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
1560                 return -ENOENT;
1561         }
1562         if (clid == TC_H_INGRESS) {
1563                 if (dev_ingress_queue(dev)) {
1564                         q = qdisc_create(dev, dev_ingress_queue(dev), p,
1565                                          tcm->tcm_parent, tcm->tcm_parent,
1566                                          tca, &err, extack);
1567                 } else {
1568                         NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
1569                         err = -ENOENT;
1570                 }
1571         } else {
1572                 struct netdev_queue *dev_queue;
1573
1574                 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1575                         dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1576                 else if (p)
1577                         dev_queue = p->dev_queue;
1578                 else
1579                         dev_queue = netdev_get_tx_queue(dev, 0);
1580
1581                 q = qdisc_create(dev, dev_queue, p,
1582                                  tcm->tcm_parent, tcm->tcm_handle,
1583                                  tca, &err, extack);
1584         }
1585         if (q == NULL) {
1586                 if (err == -EAGAIN)
1587                         goto replay;
1588                 return err;
1589         }
1590
1591 graft:
1592         err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
1593         if (err) {
1594                 if (q)
1595                         qdisc_put(q);
1596                 return err;
1597         }
1598
1599         return 0;
1600 }
1601
1602 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1603                               struct netlink_callback *cb,
1604                               int *q_idx_p, int s_q_idx, bool recur,
1605                               bool dump_invisible)
1606 {
1607         int ret = 0, q_idx = *q_idx_p;
1608         struct Qdisc *q;
1609         int b;
1610
1611         if (!root)
1612                 return 0;
1613
1614         q = root;
1615         if (q_idx < s_q_idx) {
1616                 q_idx++;
1617         } else {
1618                 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1619                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1620                                   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1621                                   RTM_NEWQDISC) <= 0)
1622                         goto done;
1623                 q_idx++;
1624         }
1625
1626         /* If dumping singletons, there is no qdisc_dev(root) and the singleton
1627          * itself has already been dumped.
1628          *
1629          * If we've already dumped the top-level (ingress) qdisc above and the global
1630          * qdisc hashtable, we don't want to hit it again
1631          */
1632         if (!qdisc_dev(root) || !recur)
1633                 goto out;
1634
1635         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1636                 if (q_idx < s_q_idx) {
1637                         q_idx++;
1638                         continue;
1639                 }
1640                 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1641                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1642                                   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1643                                   RTM_NEWQDISC) <= 0)
1644                         goto done;
1645                 q_idx++;
1646         }
1647
1648 out:
1649         *q_idx_p = q_idx;
1650         return ret;
1651 done:
1652         ret = -1;
1653         goto out;
1654 }
1655
1656 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1657 {
1658         struct net *net = sock_net(skb->sk);
1659         int idx, q_idx;
1660         int s_idx, s_q_idx;
1661         struct net_device *dev;
1662         const struct nlmsghdr *nlh = cb->nlh;
1663         struct nlattr *tca[TCA_MAX + 1];
1664         int err;
1665
1666         s_idx = cb->args[0];
1667         s_q_idx = q_idx = cb->args[1];
1668
1669         idx = 0;
1670         ASSERT_RTNL();
1671
1672         err = nlmsg_parse(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
1673                           rtm_tca_policy, cb->extack);
1674         if (err < 0)
1675                 return err;
1676
1677         for_each_netdev(net, dev) {
1678                 struct netdev_queue *dev_queue;
1679
1680                 if (idx < s_idx)
1681                         goto cont;
1682                 if (idx > s_idx)
1683                         s_q_idx = 0;
1684                 q_idx = 0;
1685
1686                 if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx,
1687                                        true, tca[TCA_DUMP_INVISIBLE]) < 0)
1688                         goto done;
1689
1690                 dev_queue = dev_ingress_queue(dev);
1691                 if (dev_queue &&
1692                     tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1693                                        &q_idx, s_q_idx, false,
1694                                        tca[TCA_DUMP_INVISIBLE]) < 0)
1695                         goto done;
1696
1697 cont:
1698                 idx++;
1699         }
1700
1701 done:
1702         cb->args[0] = idx;
1703         cb->args[1] = q_idx;
1704
1705         return skb->len;
1706 }
1707
1708
1709
1710 /************************************************
1711  *      Traffic classes manipulation.           *
1712  ************************************************/
1713
1714 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1715                           unsigned long cl,
1716                           u32 portid, u32 seq, u16 flags, int event)
1717 {
1718         struct tcmsg *tcm;
1719         struct nlmsghdr  *nlh;
1720         unsigned char *b = skb_tail_pointer(skb);
1721         struct gnet_dump d;
1722         const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1723
1724         cond_resched();
1725         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1726         if (!nlh)
1727                 goto out_nlmsg_trim;
1728         tcm = nlmsg_data(nlh);
1729         tcm->tcm_family = AF_UNSPEC;
1730         tcm->tcm__pad1 = 0;
1731         tcm->tcm__pad2 = 0;
1732         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1733         tcm->tcm_parent = q->handle;
1734         tcm->tcm_handle = q->handle;
1735         tcm->tcm_info = 0;
1736         if (nla_put_string(skb, TCA_KIND, q->ops->id))
1737                 goto nla_put_failure;
1738         if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1739                 goto nla_put_failure;
1740
1741         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1742                                          NULL, &d, TCA_PAD) < 0)
1743                 goto nla_put_failure;
1744
1745         if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1746                 goto nla_put_failure;
1747
1748         if (gnet_stats_finish_copy(&d) < 0)
1749                 goto nla_put_failure;
1750
1751         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1752         return skb->len;
1753
1754 out_nlmsg_trim:
1755 nla_put_failure:
1756         nlmsg_trim(skb, b);
1757         return -1;
1758 }
1759
1760 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1761                          struct nlmsghdr *n, struct Qdisc *q,
1762                          unsigned long cl, int event)
1763 {
1764         struct sk_buff *skb;
1765         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1766
1767         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1768         if (!skb)
1769                 return -ENOBUFS;
1770
1771         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1772                 kfree_skb(skb);
1773                 return -EINVAL;
1774         }
1775
1776         return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1777                               n->nlmsg_flags & NLM_F_ECHO);
1778 }
1779
1780 static int tclass_del_notify(struct net *net,
1781                              const struct Qdisc_class_ops *cops,
1782                              struct sk_buff *oskb, struct nlmsghdr *n,
1783                              struct Qdisc *q, unsigned long cl)
1784 {
1785         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1786         struct sk_buff *skb;
1787         int err = 0;
1788
1789         if (!cops->delete)
1790                 return -EOPNOTSUPP;
1791
1792         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1793         if (!skb)
1794                 return -ENOBUFS;
1795
1796         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
1797                            RTM_DELTCLASS) < 0) {
1798                 kfree_skb(skb);
1799                 return -EINVAL;
1800         }
1801
1802         err = cops->delete(q, cl);
1803         if (err) {
1804                 kfree_skb(skb);
1805                 return err;
1806         }
1807
1808         return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1809                               n->nlmsg_flags & NLM_F_ECHO);
1810 }
1811
1812 #ifdef CONFIG_NET_CLS
1813
/* Walker state handed to tcf_node_bind() by tc_bind_tclass(). */
struct tcf_bind_args {
        /* Must stay the first member: tcf_node_bind() casts the walker
         * pointer it receives straight back to this structure.
         */
        struct tcf_walker w;
        u32 classid;            /* classid argument given to ->bind_class() */
        unsigned long cl;       /* class argument given to ->bind_class() */
};
1819
1820 static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
1821 {
1822         struct tcf_bind_args *a = (void *)arg;
1823
1824         if (tp->ops->bind_class) {
1825                 struct Qdisc *q = tcf_block_q(tp->chain->block);
1826
1827                 sch_tree_lock(q);
1828                 tp->ops->bind_class(n, a->classid, a->cl);
1829                 sch_tree_unlock(q);
1830         }
1831         return 0;
1832 }
1833
1834 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1835                            unsigned long new_cl)
1836 {
1837         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1838         struct tcf_block *block;
1839         struct tcf_chain *chain;
1840         unsigned long cl;
1841
1842         cl = cops->find(q, portid);
1843         if (!cl)
1844                 return;
1845         block = cops->tcf_block(q, cl, NULL);
1846         if (!block)
1847                 return;
1848         list_for_each_entry(chain, &block->chain_list, list) {
1849                 struct tcf_proto *tp;
1850
1851                 for (tp = rtnl_dereference(chain->filter_chain);
1852                      tp; tp = rtnl_dereference(tp->next)) {
1853                         struct tcf_bind_args arg = {};
1854
1855                         arg.w.fn = tcf_node_bind;
1856                         arg.classid = clid;
1857                         arg.cl = new_cl;
1858                         tp->ops->walk(tp, &arg.w);
1859                 }
1860         }
1861 }
1862
1863 #else
1864
/* Without CONFIG_NET_CLS this is a no-op with the same signature. */
static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
                           unsigned long new_cl)
{
}
1869
1870 #endif
1871
/*
 * Handler for RTM_NEWTCLASS, RTM_DELTCLASS and RTM_GETTCLASS requests.
 *
 * Resolves the qdisc and class named by the tcmsg in @n on the device given
 * by tcm_ifindex, then creates/changes, deletes or reports the class.
 * Every message type except RTM_GETTCLASS requires CAP_NET_ADMIN in the
 * namespace's user namespace.
 *
 * Returns 0 on success or a negative errno.
 */
static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
                         struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct tcmsg *tcm = nlmsg_data(n);
        struct nlattr *tca[TCA_MAX + 1];
        struct net_device *dev;
        struct Qdisc *q = NULL;
        const struct Qdisc_class_ops *cops;
        unsigned long cl = 0;
        unsigned long new_cl;
        u32 portid;
        u32 clid;
        u32 qid;
        int err;

        if ((n->nlmsg_type != RTM_GETTCLASS) &&
            !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
                return -EPERM;

        err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
                          extack);
        if (err < 0)
                return err;

        dev = __dev_get_by_index(net, tcm->tcm_ifindex);
        if (!dev)
                return -ENODEV;

        /*
           parent == TC_H_UNSPEC - unspecified parent.
           parent == TC_H_ROOT   - class is root, which has no parent.
           parent == X:0         - parent is root class.
           parent == X:Y         - parent is a node in hierarchy.
           parent == 0:Y         - parent is X:Y, where X:0 is qdisc.

           handle == 0:0         - generate handle from kernel pool.
           handle == 0:Y         - class is X:Y, where X:0 is qdisc.
           handle == X:Y         - clear.
           handle == X:0         - root class.
         */

        /* Step 1. Determine qdisc handle X:0 */

        /* Note: despite its name, portid holds the parent handle here. */
        portid = tcm->tcm_parent;
        clid = tcm->tcm_handle;
        qid = TC_H_MAJ(clid);

        if (portid != TC_H_ROOT) {
                u32 qid1 = TC_H_MAJ(portid);

                if (qid && qid1) {
                        /* If both majors are known, they must be identical. */
                        if (qid != qid1)
                                return -EINVAL;
                } else if (qid1) {
                        qid = qid1;
                } else if (qid == 0)
                        /* Neither major given: use the device's root qdisc. */
                        qid = dev->qdisc->handle;

                /* Now qid is genuine qdisc handle consistent
                 * both with parent and child.
                 *
                 * TC_H_MAJ(portid) still may be unspecified, complete it now.
                 */
                if (portid)
                        portid = TC_H_MAKE(qid, portid);
        } else {
                if (qid == 0)
                        qid = dev->qdisc->handle;
        }

        /* OK. Locate qdisc */
        q = qdisc_lookup(dev, qid);
        if (!q)
                return -ENOENT;

        /* And check that it supports classes */
        cops = q->ops->cl_ops;
        if (cops == NULL)
                return -EINVAL;

        /* Now try to get class */
        if (clid == 0) {
                if (portid == TC_H_ROOT)
                        clid = qid;
        } else
                clid = TC_H_MAKE(qid, clid);

        if (clid)
                cl = cops->find(q, clid);

        if (cl == 0) {
                /* No such class: only RTM_NEWTCLASS with NLM_F_CREATE may
                 * continue; everything else reports -ENOENT.
                 */
                err = -ENOENT;
                if (n->nlmsg_type != RTM_NEWTCLASS ||
                    !(n->nlmsg_flags & NLM_F_CREATE))
                        goto out;
        } else {
                switch (n->nlmsg_type) {
                case RTM_NEWTCLASS:
                        /* Class exists: refuse if the caller asked for
                         * exclusive creation, otherwise go on to change it.
                         */
                        err = -EEXIST;
                        if (n->nlmsg_flags & NLM_F_EXCL)
                                goto out;
                        break;
                case RTM_DELTCLASS:
                        err = tclass_del_notify(net, cops, skb, n, q, cl);
                        /* Unbind the class with filters with 0 */
                        /* NOTE(review): this runs even when the delete above
                         * returned an error - confirm that is intended.
                         */
                        tc_bind_tclass(q, portid, clid, 0);
                        goto out;
                case RTM_GETTCLASS:
                        err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
                        goto out;
                default:
                        err = -EINVAL;
                        goto out;
                }
        }

        /* Only RTM_NEWTCLASS (create or change) gets this far. */
        if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
                NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
                return -EOPNOTSUPP;
        }

        new_cl = cl;
        err = -EOPNOTSUPP;
        if (cops->change)
                err = cops->change(q, clid, portid, tca, &new_cl, extack);
        if (err == 0) {
                /* The result of the notification is not propagated. */
                tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
                /* We just create a new class, need to do reverse binding. */
                if (cl != new_cl)
                        tc_bind_tclass(q, portid, clid, new_cl);
        }
out:
        return err;
}
2008
/* Walker state used while dumping the classes of one qdisc. */
struct qdisc_dump_args {
        /* Kept first: qdisc_class_dump() casts the walker pointer it is
         * given back to this structure.
         */
        struct qdisc_walker     w;
        struct sk_buff          *skb;   /* skb the class messages are written to */
        struct netlink_callback *cb;    /* dump callback supplying port id and sequence */
};
2014
2015 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
2016                             struct qdisc_walker *arg)
2017 {
2018         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
2019
2020         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
2021                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
2022                               RTM_NEWTCLASS);
2023 }
2024
2025 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
2026                                 struct tcmsg *tcm, struct netlink_callback *cb,
2027                                 int *t_p, int s_t)
2028 {
2029         struct qdisc_dump_args arg;
2030
2031         if (tc_qdisc_dump_ignore(q, false) ||
2032             *t_p < s_t || !q->ops->cl_ops ||
2033             (tcm->tcm_parent &&
2034              TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
2035                 (*t_p)++;
2036                 return 0;
2037         }
2038         if (*t_p > s_t)
2039                 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
2040         arg.w.fn = qdisc_class_dump;
2041         arg.skb = skb;
2042         arg.cb = cb;
2043         arg.w.stop  = 0;
2044         arg.w.skip = cb->args[1];
2045         arg.w.count = 0;
2046         q->ops->cl_ops->walk(q, &arg.w);
2047         cb->args[1] = arg.w.count;
2048         if (arg.w.stop)
2049                 return -1;
2050         (*t_p)++;
2051         return 0;
2052 }
2053
2054 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
2055                                struct tcmsg *tcm, struct netlink_callback *cb,
2056                                int *t_p, int s_t)
2057 {
2058         struct Qdisc *q;
2059         int b;
2060
2061         if (!root)
2062                 return 0;
2063
2064         if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
2065                 return -1;
2066
2067         if (!qdisc_dev(root))
2068                 return 0;
2069
2070         if (tcm->tcm_parent) {
2071                 q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
2072                 if (q && q != root &&
2073                     tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2074                         return -1;
2075                 return 0;
2076         }
2077         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
2078                 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2079                         return -1;
2080         }
2081
2082         return 0;
2083 }
2084
2085 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
2086 {
2087         struct tcmsg *tcm = nlmsg_data(cb->nlh);
2088         struct net *net = sock_net(skb->sk);
2089         struct netdev_queue *dev_queue;
2090         struct net_device *dev;
2091         int t, s_t;
2092
2093         if (nlmsg_len(cb->nlh) < sizeof(*tcm))
2094                 return 0;
2095         dev = dev_get_by_index(net, tcm->tcm_ifindex);
2096         if (!dev)
2097                 return 0;
2098
2099         s_t = cb->args[0];
2100         t = 0;
2101
2102         if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
2103                 goto done;
2104
2105         dev_queue = dev_ingress_queue(dev);
2106         if (dev_queue &&
2107             tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
2108                                 &t, s_t) < 0)
2109                 goto done;
2110
2111 done:
2112         cb->args[0] = t;
2113
2114         dev_put(dev);
2115         return skb->len;
2116 }
2117
2118 #ifdef CONFIG_PROC_FS
2119 static int psched_show(struct seq_file *seq, void *v)
2120 {
2121         seq_printf(seq, "%08x %08x %08x %08x\n",
2122                    (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
2123                    1000000,
2124                    (u32)NSEC_PER_SEC / hrtimer_resolution);
2125
2126         return 0;
2127 }
2128
2129 static int __net_init psched_net_init(struct net *net)
2130 {
2131         struct proc_dir_entry *e;
2132
2133         e = proc_create_single("psched", 0, net->proc_net, psched_show);
2134         if (e == NULL)
2135                 return -ENOMEM;
2136
2137         return 0;
2138 }
2139
/* Per-namespace exit: remove the "psched" entry created by psched_net_init(). */
static void __net_exit psched_net_exit(struct net *net)
{
        remove_proc_entry("psched", net->proc_net);
}
2144 #else
/* Without CONFIG_PROC_FS there is no proc entry to create. */
static int __net_init psched_net_init(struct net *net)
{
        return 0;
}
2149
/* Without CONFIG_PROC_FS there is no proc entry to remove. */
static void __net_exit psched_net_exit(struct net *net)
{
}
2153 #endif
2154
/* Per-network-namespace hooks, registered from pktsched_init(). */
static struct pernet_operations psched_net_ops = {
        .init = psched_net_init,
        .exit = psched_net_exit,
};
2159
2160 static int __init pktsched_init(void)
2161 {
2162         int err;
2163
2164         err = register_pernet_subsys(&psched_net_ops);
2165         if (err) {
2166                 pr_err("pktsched_init: "
2167                        "cannot initialize per netns operations\n");
2168                 return err;
2169         }
2170
2171         register_qdisc(&pfifo_fast_ops);
2172         register_qdisc(&pfifo_qdisc_ops);
2173         register_qdisc(&bfifo_qdisc_ops);
2174         register_qdisc(&pfifo_head_drop_qdisc_ops);
2175         register_qdisc(&mq_qdisc_ops);
2176         register_qdisc(&noqueue_qdisc_ops);
2177
2178         rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
2179         rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
2180         rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
2181                       0);
2182         rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
2183         rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
2184         rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
2185                       0);
2186
2187         return 0;
2188 }
2189
2190 subsys_initcall(pktsched_init);