net/sched/sch_tbf.c

   1 /*
   2  * net/sched/sch_tbf.c  Token Bucket Filter queue.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  10  *              Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs -
  11  *                                               original idea by Martin Devera
  12  *
  13  */
  14
  15 #include <linux/module.h>
  16 #include <linux/types.h>
  17 #include <linux/kernel.h>
  18 #include <linux/string.h>
  19 #include <linux/errno.h>
  20 #include <linux/skbuff.h>
  21 #include <net/netlink.h>
  22 #include <net/sch_generic.h>
  23 #include <net/pkt_sched.h>
  24
  25
  26 /*      Simple Token Bucket Filter.
  27         =======================================
  28
  29         SOURCE.
  30         -------
  31
  32         None.
  33
  34         Description.
  35         ------------
  36
  37         A data flow obeys TBF with rate R and depth B, if for any
  38         time interval t_i...t_f the number of transmitted bits
  39         does not exceed B + R*(t_f-t_i).
  40
  41         Packetized version of this definition:
  42         The sequence of packets of sizes s_i served at moments t_i
  43         obeys TBF, if for any i<=k:
  44
  45         s_i+....+s_k <= B + R*(t_k - t_i)
  46
  47         Algorithm.
  48         ----------
  49
  50         Let N(t_i) be B/R initially and N(t) grow continuously with time as:
  51
  52         N(t+delta) = min{B/R, N(t) + delta}
  53
  54         If the first packet in queue has length S, it may be
  55         transmitted only at the time t_* when S/R <= N(t_*),
  56         and in this case N(t) jumps:
  57
  58         N(t_* + 0) = N(t_* - 0) - S/R.
  59
  60
  61
  62         Actually, QoS requires two TBF to be applied to a data stream.
  63         One of them controls steady state burst size, another
  64         one with rate P (peak rate) and depth M (equal to link MTU)
  65         limits bursts at a smaller time scale.
  66
  67         It is easy to see that P>R, and B>M. If P is infinity, this double
  68         TBF is equivalent to a single one.
  69
  70         When TBF works in reshaping mode, latency is estimated as:
  71
  72         lat = max ((L-B)/R, (L-M)/P)
  73
  74
  75         NOTES.
  76         ------
  77
	If TBF throttles, it starts a watchdog timer, which will wake it up
	when it is ready to transmit.
	Note that the minimal timer resolution is 1/HZ.
	If no new packets arrive during this period,
	or if the device is not woken up by EOI for some previous packet,
	TBF can stop its activity for 1/HZ.
  84
  85
  86         This means, that with depth B, the maximal rate is
  87
  88         R_crit = B*HZ
  89
  90         F.e. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes.
  91
	Note that the peak rate TBF is much tougher: with MTU 1500
	P_crit = 150Kbytes/sec. So, if you need greater peak
	rates, use alpha with HZ=1000 :-)
  95
  96         With classful TBF, limit is just kept for backwards compatibility.
  97         It is passed to the default bfifo qdisc - if the inner qdisc is
  98         changed the limit is not effective anymore.
  99 */
 100
 101 struct tbf_sched_data {
 102 /* Parameters */
 103         u32             limit;          /* Maximal length of backlog: bytes */
 104         u32             max_size;
 105         s64             buffer;         /* Token bucket depth/rate: MUST BE >= MTU/B */
 106         s64             mtu;
 107         struct psched_ratecfg rate;
 108         struct psched_ratecfg peak;
 109
 110 /* Variables */
 111         s64     tokens;                 /* Current number of B tokens */
 112         s64     ptokens;                /* Current number of P tokens */
 113         s64     t_c;                    /* Time check-point */
 114         struct Qdisc    *qdisc;         /* Inner qdisc, default - bfifo queue */
 115         struct qdisc_watchdog watchdog; /* Watchdog timer */
 116 };
 117
 118
 119 /* Time to Length, convert time in ns to length in bytes
 120  * to determinate how many bytes can be sent in given time.
 121  */
 122 static u64 psched_ns_t2l(const struct psched_ratecfg *r,
 123                          u64 time_in_ns)
 124 {
 125         /* The formula is :
 126          * len = (time_in_ns * r->rate_bytes_ps) / NSEC_PER_SEC
 127          */
 128         u64 len = time_in_ns * r->rate_bytes_ps;
 129
 130         do_div(len, NSEC_PER_SEC);
 131
 132         if (unlikely(r->linklayer == TC_LINKLAYER_ATM)) {
 133                 do_div(len, 53);
 134                 len = len * 48;
 135         }
 136
 137         if (len > r->overhead)
 138                 len -= r->overhead;
 139         else
 140                 len = 0;
 141
 142         return len;
 143 }
 144
 145 /* GSO packet is too big, segment it so that tbf can transmit
 146  * each segment in time
 147  */
 148 static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch,
 149                        struct sk_buff **to_free)
 150 {
 151         struct tbf_sched_data *q = qdisc_priv(sch);
 152         struct sk_buff *segs, *nskb;
 153         netdev_features_t features = netif_skb_features(skb);
 154         unsigned int len = 0, prev_len = qdisc_pkt_len(skb);
 155         int ret, nb;
 156
 157         segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
 158
 159         if (IS_ERR_OR_NULL(segs))
 160                 return qdisc_drop(skb, sch, to_free);
 161
 162         nb = 0;
 163         while (segs) {
 164                 nskb = segs->next;
 165                 skb_mark_not_on_list(segs);
 166                 qdisc_skb_cb(segs)->pkt_len = segs->len;
 167                 len += segs->len;
 168                 ret = qdisc_enqueue(segs, q->qdisc, to_free);
 169                 if (ret != NET_XMIT_SUCCESS) {
 170                         if (net_xmit_drop_count(ret))
 171                                 qdisc_qstats_drop(sch);
 172                 } else {
 173                         nb++;
 174                 }
 175                 segs = nskb;
 176         }
 177         sch->q.qlen += nb;
 178         if (nb > 1)
 179                 qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len);
 180         consume_skb(skb);
 181         return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
 182 }
 183
 184 static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 185                        struct sk_buff **to_free)
 186 {
 187         struct tbf_sched_data *q = qdisc_priv(sch);
 188         unsigned int len = qdisc_pkt_len(skb);
 189         int ret;
 190
 191         if (qdisc_pkt_len(skb) > q->max_size) {
 192                 if (skb_is_gso(skb) &&
 193                     skb_gso_validate_mac_len(skb, q->max_size))
 194                         return tbf_segment(skb, sch, to_free);
 195                 return qdisc_drop(skb, sch, to_free);
 196         }
 197         ret = qdisc_enqueue(skb, q->qdisc, to_free);
 198         if (ret != NET_XMIT_SUCCESS) {
 199                 if (net_xmit_drop_count(ret))
 200                         qdisc_qstats_drop(sch);
 201                 return ret;
 202         }
 203
 204         sch->qstats.backlog += len;
 205         sch->q.qlen++;
 206         return NET_XMIT_SUCCESS;
 207 }
 208
 209 static bool tbf_peak_present(const struct tbf_sched_data *q)
 210 {
 211         return q->peak.rate_bytes_ps;
 212 }
 213
 214 static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
 215 {
 216         struct tbf_sched_data *q = qdisc_priv(sch);
 217         struct sk_buff *skb;
 218
 219         skb = q->qdisc->ops->peek(q->qdisc);
 220
 221         if (skb) {
 222                 s64 now;
 223                 s64 toks;
 224                 s64 ptoks = 0;
 225                 unsigned int len = qdisc_pkt_len(skb);
 226
 227                 now = ktime_get_ns();
 228                 toks = min_t(s64, now - q->t_c, q->buffer);
 229
 230                 if (tbf_peak_present(q)) {
 231                         ptoks = toks + q->ptokens;
 232                         if (ptoks > q->mtu)
 233                                 ptoks = q->mtu;
 234                         ptoks -= (s64) psched_l2t_ns(&q->peak, len);
 235                 }
 236                 toks += q->tokens;
 237                 if (toks > q->buffer)
 238                         toks = q->buffer;
 239                 toks -= (s64) psched_l2t_ns(&q->rate, len);
 240
 241                 if ((toks|ptoks) >= 0) {
 242                         skb = qdisc_dequeue_peeked(q->qdisc);
 243                         if (unlikely(!skb))
 244                                 return NULL;
 245
 246                         q->t_c = now;
 247                         q->tokens = toks;
 248                         q->ptokens = ptoks;
 249                         qdisc_qstats_backlog_dec(sch, skb);
 250                         sch->q.qlen--;
 251                         qdisc_bstats_update(sch, skb);
 252                         return skb;
 253                 }
 254
 255                 qdisc_watchdog_schedule_ns(&q->watchdog,
 256                                            now + max_t(long, -toks, -ptoks));
 257
 258                 /* Maybe we have a shorter packet in the queue,
 259                    which can be sent now. It sounds cool,
 260                    but, however, this is wrong in principle.
 261                    We MUST NOT reorder packets under these circumstances.
 262
 263                    Really, if we split the flow into independent
 264                    subflows, it would be a very good solution.
 265                    This is the main idea of all FQ algorithms
 266                    (cf. CSZ, HPFQ, HFSC)
 267                  */
 268
 269                 qdisc_qstats_overlimit(sch);
 270         }
 271         return NULL;
 272 }
 273
 274 static void tbf_reset(struct Qdisc *sch)
 275 {
 276         struct tbf_sched_data *q = qdisc_priv(sch);
 277
 278         qdisc_reset(q->qdisc);
 279         sch->qstats.backlog = 0;
 280         sch->q.qlen = 0;
 281         q->t_c = ktime_get_ns();
 282         q->tokens = q->buffer;
 283         q->ptokens = q->mtu;
 284         qdisc_watchdog_cancel(&q->watchdog);
 285 }
 286
 287 static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = {
 288         [TCA_TBF_PARMS] = { .len = sizeof(struct tc_tbf_qopt) },
 289         [TCA_TBF_RTAB]  = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
 290         [TCA_TBF_PTAB]  = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
 291         [TCA_TBF_RATE64]        = { .type = NLA_U64 },
 292         [TCA_TBF_PRATE64]       = { .type = NLA_U64 },
 293         [TCA_TBF_BURST] = { .type = NLA_U32 },
 294         [TCA_TBF_PBURST] = { .type = NLA_U32 },
 295 };
 296
 297 static int tbf_change(struct Qdisc *sch, struct nlattr *opt,
 298                       struct netlink_ext_ack *extack)
 299 {
 300         int err;
 301         struct tbf_sched_data *q = qdisc_priv(sch);
 302         struct nlattr *tb[TCA_TBF_MAX + 1];
 303         struct tc_tbf_qopt *qopt;
 304         struct Qdisc *child = NULL;
 305         struct psched_ratecfg rate;
 306         struct psched_ratecfg peak;
 307         u64 max_size;
 308         s64 buffer, mtu;
 309         u64 rate64 = 0, prate64 = 0;
 310
 311         err = nla_parse_nested_deprecated(tb, TCA_TBF_MAX, opt, tbf_policy,
 312                                           NULL);
 313         if (err < 0)
 314                 return err;
 315
 316         err = -EINVAL;
 317         if (tb[TCA_TBF_PARMS] == NULL)
 318                 goto done;
 319
 320         qopt = nla_data(tb[TCA_TBF_PARMS]);
 321         if (qopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
 322                 qdisc_put_rtab(qdisc_get_rtab(&qopt->rate,
 323                                               tb[TCA_TBF_RTAB],
 324                                               NULL));
 325
 326         if (qopt->peakrate.linklayer == TC_LINKLAYER_UNAWARE)
 327                         qdisc_put_rtab(qdisc_get_rtab(&qopt->peakrate,
 328                                                       tb[TCA_TBF_PTAB],
 329                                                       NULL));
 330
 331         buffer = min_t(u64, PSCHED_TICKS2NS(qopt->buffer), ~0U);
 332         mtu = min_t(u64, PSCHED_TICKS2NS(qopt->mtu), ~0U);
 333
 334         if (tb[TCA_TBF_RATE64])
 335                 rate64 = nla_get_u64(tb[TCA_TBF_RATE64]);
 336         psched_ratecfg_precompute(&rate, &qopt->rate, rate64);
 337
 338         if (tb[TCA_TBF_BURST]) {
 339                 max_size = nla_get_u32(tb[TCA_TBF_BURST]);
 340                 buffer = psched_l2t_ns(&rate, max_size);
 341         } else {
 342                 max_size = min_t(u64, psched_ns_t2l(&rate, buffer), ~0U);
 343         }
 344
 345         if (qopt->peakrate.rate) {
 346                 if (tb[TCA_TBF_PRATE64])
 347                         prate64 = nla_get_u64(tb[TCA_TBF_PRATE64]);
 348                 psched_ratecfg_precompute(&peak, &qopt->peakrate, prate64);
 349                 if (peak.rate_bytes_ps <= rate.rate_bytes_ps) {
 350                         pr_warn_ratelimited("sch_tbf: peakrate %llu is lower than or equals to rate %llu !\n",
 351                                         peak.rate_bytes_ps, rate.rate_bytes_ps);
 352                         err = -EINVAL;
 353                         goto done;
 354                 }
 355
 356                 if (tb[TCA_TBF_PBURST]) {
 357                         u32 pburst = nla_get_u32(tb[TCA_TBF_PBURST]);
 358                         max_size = min_t(u32, max_size, pburst);
 359                         mtu = psched_l2t_ns(&peak, pburst);
 360                 } else {
 361                         max_size = min_t(u64, max_size, psched_ns_t2l(&peak, mtu));
 362                 }
 363         } else {
 364                 memset(&peak, 0, sizeof(peak));
 365         }
 366
 367         if (max_size < psched_mtu(qdisc_dev(sch)))
 368                 pr_warn_ratelimited("sch_tbf: burst %llu is lower than device %s mtu (%u) !\n",
 369                                     max_size, qdisc_dev(sch)->name,
 370                                     psched_mtu(qdisc_dev(sch)));
 371
 372         if (!max_size) {
 373                 err = -EINVAL;
 374                 goto done;
 375         }
 376
 377         if (q->qdisc != &noop_qdisc) {
 378                 err = fifo_set_limit(q->qdisc, qopt->limit);
 379                 if (err)
 380                         goto done;
 381         } else if (qopt->limit > 0) {
 382                 child = fifo_create_dflt(sch, &bfifo_qdisc_ops, qopt->limit,
 383                                          extack);
 384                 if (IS_ERR(child)) {
 385                         err = PTR_ERR(child);
 386                         goto done;
 387                 }
 388
 389                 /* child is fifo, no need to check for noop_qdisc */
 390                 qdisc_hash_add(child, true);
 391         }
 392
 393         sch_tree_lock(sch);
 394         if (child) {
 395                 qdisc_tree_flush_backlog(q->qdisc);
 396                 qdisc_put(q->qdisc);
 397                 q->qdisc = child;
 398         }
 399         q->limit = qopt->limit;
 400         if (tb[TCA_TBF_PBURST])
 401                 q->mtu = mtu;
 402         else
 403                 q->mtu = PSCHED_TICKS2NS(qopt->mtu);
 404         q->max_size = max_size;
 405         if (tb[TCA_TBF_BURST])
 406                 q->buffer = buffer;
 407         else
 408                 q->buffer = PSCHED_TICKS2NS(qopt->buffer);
 409         q->tokens = q->buffer;
 410         q->ptokens = q->mtu;
 411
 412         memcpy(&q->rate, &rate, sizeof(struct psched_ratecfg));
 413         memcpy(&q->peak, &peak, sizeof(struct psched_ratecfg));
 414
 415         sch_tree_unlock(sch);
 416         err = 0;
 417 done:
 418         return err;
 419 }
 420
 421 static int tbf_init(struct Qdisc *sch, struct nlattr *opt,
 422                     struct netlink_ext_ack *extack)
 423 {
 424         struct tbf_sched_data *q = qdisc_priv(sch);
 425
 426         qdisc_watchdog_init(&q->watchdog, sch);
 427         q->qdisc = &noop_qdisc;
 428
 429         if (!opt)
 430                 return -EINVAL;
 431
 432         q->t_c = ktime_get_ns();
 433
 434         return tbf_change(sch, opt, extack);
 435 }
 436
 437 static void tbf_destroy(struct Qdisc *sch)
 438 {
 439         struct tbf_sched_data *q = qdisc_priv(sch);
 440
 441         qdisc_watchdog_cancel(&q->watchdog);
 442         qdisc_put(q->qdisc);
 443 }
 444
 445 static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
 446 {
 447         struct tbf_sched_data *q = qdisc_priv(sch);
 448         struct nlattr *nest;
 449         struct tc_tbf_qopt opt;
 450
 451         sch->qstats.backlog = q->qdisc->qstats.backlog;
 452         nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
 453         if (nest == NULL)
 454                 goto nla_put_failure;
 455
 456         opt.limit = q->limit;
 457         psched_ratecfg_getrate(&opt.rate, &q->rate);
 458         if (tbf_peak_present(q))
 459                 psched_ratecfg_getrate(&opt.peakrate, &q->peak);
 460         else
 461                 memset(&opt.peakrate, 0, sizeof(opt.peakrate));
 462         opt.mtu = PSCHED_NS2TICKS(q->mtu);
 463         opt.buffer = PSCHED_NS2TICKS(q->buffer);
 464         if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt))
 465                 goto nla_put_failure;
 466         if (q->rate.rate_bytes_ps >= (1ULL << 32) &&
 467             nla_put_u64_64bit(skb, TCA_TBF_RATE64, q->rate.rate_bytes_ps,
 468                               TCA_TBF_PAD))
 469                 goto nla_put_failure;
 470         if (tbf_peak_present(q) &&
 471             q->peak.rate_bytes_ps >= (1ULL << 32) &&
 472             nla_put_u64_64bit(skb, TCA_TBF_PRATE64, q->peak.rate_bytes_ps,
 473                               TCA_TBF_PAD))
 474                 goto nla_put_failure;
 475
 476         return nla_nest_end(skb, nest);
 477
 478 nla_put_failure:
 479         nla_nest_cancel(skb, nest);
 480         return -1;
 481 }
 482
 483 static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
 484                           struct sk_buff *skb, struct tcmsg *tcm)
 485 {
 486         struct tbf_sched_data *q = qdisc_priv(sch);
 487
 488         tcm->tcm_handle |= TC_H_MIN(1);
 489         tcm->tcm_info = q->qdisc->handle;
 490
 491         return 0;
 492 }
 493
 494 static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
 495                      struct Qdisc **old, struct netlink_ext_ack *extack)
 496 {
 497         struct tbf_sched_data *q = qdisc_priv(sch);
 498
 499         if (new == NULL)
 500                 new = &noop_qdisc;
 501
 502         *old = qdisc_replace(sch, new, &q->qdisc);
 503         return 0;
 504 }
 505
 506 static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg)
 507 {
 508         struct tbf_sched_data *q = qdisc_priv(sch);
 509         return q->qdisc;
 510 }
 511
 512 static unsigned long tbf_find(struct Qdisc *sch, u32 classid)
 513 {
 514         return 1;
 515 }
 516
 517 static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
 518 {
 519         if (!walker->stop) {
 520                 if (walker->count >= walker->skip)
 521                         if (walker->fn(sch, 1, walker) < 0) {
 522                                 walker->stop = 1;
 523                                 return;
 524                         }
 525                 walker->count++;
 526         }
 527 }
 528
 529 static const struct Qdisc_class_ops tbf_class_ops = {
 530         .graft          =       tbf_graft,
 531         .leaf           =       tbf_leaf,
 532         .find           =       tbf_find,
 533         .walk           =       tbf_walk,
 534         .dump           =       tbf_dump_class,
 535 };
 536
 537 static struct Qdisc_ops tbf_qdisc_ops __read_mostly = {
 538         .next           =       NULL,
 539         .cl_ops         =       &tbf_class_ops,
 540         .id             =       "tbf",
 541         .priv_size      =       sizeof(struct tbf_sched_data),
 542         .enqueue        =       tbf_enqueue,
 543         .dequeue        =       tbf_dequeue,
 544         .peek           =       qdisc_peek_dequeued,
 545         .init           =       tbf_init,
 546         .reset          =       tbf_reset,
 547         .destroy        =       tbf_destroy,
 548         .change         =       tbf_change,
 549         .dump           =       tbf_dump,
 550         .owner          =       THIS_MODULE,
 551 };
 552
 553 static int __init tbf_module_init(void)
 554 {
 555         return register_qdisc(&tbf_qdisc_ops);
 556 }
 557
 558 static void __exit tbf_module_exit(void)
 559 {
 560         unregister_qdisc(&tbf_qdisc_ops);
 561 }
 562 module_init(tbf_module_init)
 563 module_exit(tbf_module_exit)
 564 MODULE_LICENSE("GPL");