1 /*
2 * net/sched/sch_api.c Packet scheduler API.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 *
11 * Fixes:
12 *
13 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16 */
17
18 #include <linux/config.h>
19 #include <linux/module.h>
20 #include <linux/types.h>
21 #include <linux/kernel.h>
22 #include <linux/sched.h>
23 #include <linux/string.h>
24 #include <linux/mm.h>
25 #include <linux/socket.h>
26 #include <linux/sockios.h>
27 #include <linux/in.h>
28 #include <linux/errno.h>
29 #include <linux/interrupt.h>
30 #include <linux/netdevice.h>
31 #include <linux/skbuff.h>
32 #include <linux/rtnetlink.h>
33 #include <linux/init.h>
34 #include <linux/proc_fs.h>
35 #include <linux/seq_file.h>
36 #include <linux/kmod.h>
37 #include <linux/list.h>
38 #include <linux/bitops.h>
39
40 #include <net/sock.h>
41 #include <net/pkt_sched.h>
42
43 #include <asm/processor.h>
44 #include <asm/uaccess.h>
45 #include <asm/system.h>
46
/* Forward declarations: both notifiers are defined later in this file but are
   called from the qdisc and class control handlers that precede them. */
47 static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
48 struct Qdisc *old, struct Qdisc *new);
49 static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
50 struct Qdisc *q, unsigned long cl, int event);
51
52 /*
53
54 Short review.
55 -------------
56
57 This file consists of two interrelated parts:
58
59 1. queueing disciplines manager frontend.
60 2. traffic classes manager frontend.
61
62 Generally, queueing discipline ("qdisc") is a black box,
63 which is able to enqueue packets and to dequeue them (when
64 device is ready to send something) in order and at times
65 determined by algorithm hidden in it.
66
67 qdiscs are divided into two categories:
68 - "queues", which have no internal structure visible from outside.
69 - "schedulers", which split all the packets to "traffic classes",
70 using "packet classifiers" (look at cls_api.c)
71
72 In turn, classes may have child qdiscs (as rule, queues)
73 attached to them etc. etc. etc.
74
75 The goal of the routines in this file is to translate
76 information supplied by the user in the form of handles
77 into a form more intelligible to the kernel, to perform some sanity
78 checks and the part of the work which is common to all qdiscs,
79 and to provide rtnetlink notifications.
80
81 All real intelligent work is done inside qdisc modules.
82
83
84
85 Every discipline has two major routines: enqueue and dequeue.
86
87 ---dequeue
88
89 dequeue usually returns a skb to send. It is allowed to return NULL,
90 but it does not mean that queue is empty, it just means that
91 discipline does not want to send anything this time.
92 Queue is really empty if q->q.qlen == 0.
93 For complicated disciplines with multiple queues q->q is not
94 real packet queue, but however q->q.qlen must be valid.
95
96 ---enqueue
97
98 enqueue returns 0 if the packet was enqueued successfully.
99 If a packet (this one or another one) was dropped, it returns
100 a non-zero error code.
101 NET_XMIT_DROP - this packet dropped
102 Expected action: do not backoff, but wait until queue will clear.
103 NET_XMIT_CN - probably this packet enqueued, but another one dropped.
104 Expected action: backoff or ignore
105 NET_XMIT_POLICED - dropped by police.
106 Expected action: backoff or error to real-time apps.
107
108 Auxiliary routines:
109
110 ---requeue
111
112 requeues once dequeued packet. It is used for non-standard or
113 just buggy devices, which can defer output even if dev->tbusy=0.
114
115 ---reset
116
117 returns qdisc to initial state: purge all buffers, clear all
118 timers, counters (except for statistics) etc.
119
120 ---init
121
122 initializes newly created qdisc.
123
124 ---destroy
125
126 destroys resources allocated by init and during lifetime of qdisc.
127
128 ---change
129
130 changes qdisc parameters.
131 */
132
133 /* Protects the list of registered TC modules (the qdisc_base chain
   declared below). It is a pure SMP lock. */
134 static DEFINE_RWLOCK(qdisc_mod_lock);
135
136
137 /************************************************
138 * Queueing disciplines manipulation. *
139 ************************************************/
140
141
142 /* The list of all installed queueing disciplines, singly linked through
   Qdisc_ops.next and terminated by NULL. */
143
144 static struct Qdisc_ops *qdisc_base;
145
146 /* Register/unregister queueing discipline */
147
148 int register_qdisc(struct Qdisc_ops *qops)
149 {
150 struct Qdisc_ops *q, **qp;
151 int rc = -EEXIST;
152
153 write_lock(&qdisc_mod_lock);
154 for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
155 if (!strcmp(qops->id, q->id))
156 goto out;
157
158 if (qops->enqueue == NULL)
159 qops->enqueue = noop_qdisc_ops.enqueue;
160 if (qops->requeue == NULL)
161 qops->requeue = noop_qdisc_ops.requeue;
162 if (qops->dequeue == NULL)
163 qops->dequeue = noop_qdisc_ops.dequeue;
164
165 qops->next = NULL;
166 *qp = qops;
167 rc = 0;
168 out:
169 write_unlock(&qdisc_mod_lock);
170 return rc;
171 }
172
173 int unregister_qdisc(struct Qdisc_ops *qops)
174 {
175 struct Qdisc_ops *q, **qp;
176 int err = -ENOENT;
177
178 write_lock(&qdisc_mod_lock);
179 for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next)
180 if (q == qops)
181 break;
182 if (q) {
183 *qp = q->next;
184 q->next = NULL;
185 err = 0;
186 }
187 write_unlock(&qdisc_mod_lock);
188 return err;
189 }
190
191 /* We know handle. Find qdisc among all qdisc's attached to device
192 (root qdisc, all its children, children of children etc.)
193 */
194
195 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
196 {
197 struct Qdisc *q;
198
199 read_lock_bh(&qdisc_tree_lock);
200 list_for_each_entry(q, &dev->qdisc_list, list) {
201 if (q->handle == handle) {
202 read_unlock_bh(&qdisc_tree_lock);
203 return q;
204 }
205 }
206 read_unlock_bh(&qdisc_tree_lock);
207 return NULL;
208 }
209
210 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
211 {
212 unsigned long cl;
213 struct Qdisc *leaf;
214 struct Qdisc_class_ops *cops = p->ops->cl_ops;
215
216 if (cops == NULL)
217 return NULL;
218 cl = cops->get(p, classid);
219
220 if (cl == 0)
221 return NULL;
222 leaf = cops->leaf(p, cl);
223 cops->put(p, cl);
224 return leaf;
225 }
226
227 /* Find queueing discipline by name */
228
229 static struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind)
230 {
231 struct Qdisc_ops *q = NULL;
232
233 if (kind) {
234 read_lock(&qdisc_mod_lock);
235 for (q = qdisc_base; q; q = q->next) {
236 if (rtattr_strcmp(kind, q->id) == 0) {
237 if (!try_module_get(q->owner))
238 q = NULL;
239 break;
240 }
241 }
242 read_unlock(&qdisc_mod_lock);
243 }
244 return q;
245 }
246
247 static struct qdisc_rate_table *qdisc_rtab_list;
248
249 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab)
250 {
251 struct qdisc_rate_table *rtab;
252
253 for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
254 if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
255 rtab->refcnt++;
256 return rtab;
257 }
258 }
259
260 if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024)
261 return NULL;
262
263 rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
264 if (rtab) {
265 rtab->rate = *r;
266 rtab->refcnt = 1;
267 memcpy(rtab->data, RTA_DATA(tab), 1024);
268 rtab->next = qdisc_rtab_list;
269 qdisc_rtab_list = rtab;
270 }
271 return rtab;
272 }
273
274 void qdisc_put_rtab(struct qdisc_rate_table *tab)
275 {
276 struct qdisc_rate_table *rtab, **rtabp;
277
278 if (!tab || --tab->refcnt)
279 return;
280
281 for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) {
282 if (rtab == tab) {
283 *rtabp = rtab->next;
284 kfree(rtab);
285 return;
286 }
287 }
288 }
289
290
291 /* Allocate a unique handle from the space managed by the kernel */
292
293 static u32 qdisc_alloc_handle(struct net_device *dev)
294 {
295 int i = 0x10000;
296 static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
297
298 do {
299 autohandle += TC_H_MAKE(0x10000U, 0);
300 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
301 autohandle = TC_H_MAKE(0x80000000U, 0);
302 } while (qdisc_lookup(dev, autohandle) && --i > 0);
303
304 return i>0 ? autohandle : 0;
305 }
306
307 /* Attach toplevel qdisc to device dev */
308
/*
 * Install "qdisc" as a top-level qdisc of "dev" and return the qdisc that
 * occupied the slot before (dev->qdisc_ingress when "qdisc" carries
 * TCQ_F_INGRESS, dev->qdisc_sleeping otherwise).  The old qdisc is at most
 * reset here, never destroyed.
 *
 * A device that is IFF_UP is deactivated before the switch and activated
 * again afterwards; the switch itself runs under the qdisc tree lock.
 */
309 static struct Qdisc *
310 dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc)
311 {
312 struct Qdisc *oqdisc;
313
314 if (dev->flags & IFF_UP)
315 dev_deactivate(dev);
316
317 qdisc_lock_tree(dev);
318 if (qdisc && qdisc->flags&TCQ_F_INGRESS) {
319 oqdisc = dev->qdisc_ingress;
320 /* Prune old scheduler */
321 if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) {
322 /* delete: the ingress slot is cleared and "qdisc" is NOT installed */
323 qdisc_reset(oqdisc);
324 dev->qdisc_ingress = NULL;
325 } else { /* new */
326 dev->qdisc_ingress = qdisc;
327 }
328
329 } else {
330
331 oqdisc = dev->qdisc_sleeping;
332
333 /* Prune old scheduler */
334 if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
335 qdisc_reset(oqdisc);
336
337 /* ... and graft new one */
338 if (qdisc == NULL)
339 qdisc = &noop_qdisc;
340 dev->qdisc_sleeping = qdisc;
/* The active qdisc is parked on noop here; dev_activate() below runs only
   when the device is up (presumably it re-installs qdisc_sleeping - confirm). */
341 dev->qdisc = &noop_qdisc;
342 }
343
344 qdisc_unlock_tree(dev);
345
346 if (dev->flags & IFF_UP)
347 dev_activate(dev);
348
349 return oqdisc;
350 }
351
352
353 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
354 to device "dev".
355
356 Old qdisc is not destroyed but returned in *old.
357 */
358
359 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
360 u32 classid,
361 struct Qdisc *new, struct Qdisc **old)
362 {
363 int err = 0;
364 struct Qdisc *q = *old;
365
366
367 if (parent == NULL) {
368 if (q && q->flags&TCQ_F_INGRESS) {
369 *old = dev_graft_qdisc(dev, q);
370 } else {
371 *old = dev_graft_qdisc(dev, new);
372 }
373 } else {
374 struct Qdisc_class_ops *cops = parent->ops->cl_ops;
375
376 err = -EINVAL;
377
378 if (cops) {
379 unsigned long cl = cops->get(parent, classid);
380 if (cl) {
381 err = cops->graft(parent, cl, new, old);
382 if (new)
383 new->parent = classid;
384 cops->put(parent, cl);
385 }
386 }
387 }
388 return err;
389 }
390
391 /*
392 Allocate and initialize new qdisc.
393
394 Parameters are passed via opt.
395 */
396
/*
 * Build a new qdisc of the kind named by tca[TCA_KIND-1] for "dev".
 *
 * handle == 0 requests an automatically allocated handle; handle ==
 * TC_H_INGRESS marks the qdisc TCQ_F_INGRESS and gives it the handle
 * TC_H_MAKE(TC_H_INGRESS, 0).
 *
 * Returns the new qdisc, already linked on dev->qdisc_list, or NULL with
 * *errp set: -EINVAL (kind unknown), -ENOBUFS (allocation failed),
 * -ENOMEM (no free handle) or the error returned by ops->init().
 * On success the device reference taken with dev_hold() and the module
 * reference taken by qdisc_lookup_ops() remain held; the error labels
 * release whatever had been taken up to the point of failure.
 */
397 static struct Qdisc *
398 qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
399 {
400 int err;
401 struct rtattr *kind = tca[TCA_KIND-1];
402 void *p = NULL;
403 struct Qdisc *sch;
404 struct Qdisc_ops *ops;
405 int size;
406
407 ops = qdisc_lookup_ops(kind);
408 #ifdef CONFIG_KMOD
/* Kind not registered: ask for module "sch_<kind>" and look once more. */
409 if (ops==NULL && tca[TCA_KIND-1] != NULL) {
410 char name[IFNAMSIZ];
411 if (rtattr_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
412 request_module("sch_%s", name);
413 ops = qdisc_lookup_ops(kind);
414 }
415 }
416 #endif
417
418 err = -EINVAL;
419 if (ops == NULL)
420 goto err_out;
421
422 /* ensure that the Qdisc and the private data are 32-byte aligned */
/* The extra QDISC_ALIGN_CONST bytes leave room to round "sch" up below. */
423 size = ((sizeof(*sch) + QDISC_ALIGN_CONST) & ~QDISC_ALIGN_CONST);
424 size += ops->priv_size + QDISC_ALIGN_CONST;
425
426 p = kmalloc(size, GFP_KERNEL);
427 err = -ENOBUFS;
428 if (!p)
429 goto err_out2;
430 memset(p, 0, size);
431 sch = (struct Qdisc *)(((unsigned long)p + QDISC_ALIGN_CONST)
432 & ~QDISC_ALIGN_CONST);
/* Remember how far "sch" sits from the start of the allocation. */
433 sch->padded = (char *)sch - (char *)p;
434
435 INIT_LIST_HEAD(&sch->list);
436 skb_queue_head_init(&sch->q);
437
438 if (handle == TC_H_INGRESS)
439 sch->flags |= TCQ_F_INGRESS;
440
441 sch->ops = ops;
442 sch->enqueue = ops->enqueue;
443 sch->dequeue = ops->dequeue;
444 sch->dev = dev;
445 dev_hold(dev);
446 atomic_set(&sch->refcnt, 1);
447 sch->stats_lock = &dev->queue_lock;
448 if (handle == 0) {
449 handle = qdisc_alloc_handle(dev);
450 err = -ENOMEM;
451 if (handle == 0)
452 goto err_out3;
453 }
454
455 if (handle == TC_H_INGRESS)
456 sch->handle =TC_H_MAKE(TC_H_INGRESS, 0);
457 else
458 sch->handle = handle;
459
/* A discipline without an init operation counts as initialised. */
460 if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
461 qdisc_lock_tree(dev);
462 list_add_tail(&sch->list, &dev->qdisc_list);
463 qdisc_unlock_tree(dev);
464
465 #ifdef CONFIG_NET_ESTIMATOR
466 if (tca[TCA_RATE-1])
467 gen_new_estimator(&sch->bstats, &sch->rate_est,
468 sch->stats_lock, tca[TCA_RATE-1]);
469 #endif
470 return sch;
471 }
/* Error unwinding; a failed ops->init() falls through to here. */
472 err_out3:
473 dev_put(dev);
474 err_out2:
475 module_put(ops->owner);
476 err_out:
477 *errp = err;
478 if (p)
479 kfree(p);
480 return NULL;
481 }
482
483 static int qdisc_change(struct Qdisc *sch, struct rtattr **tca)
484 {
485 if (tca[TCA_OPTIONS-1]) {
486 int err;
487
488 if (sch->ops->change == NULL)
489 return -EINVAL;
490 err = sch->ops->change(sch, tca[TCA_OPTIONS-1]);
491 if (err)
492 return err;
493 }
494 #ifdef CONFIG_NET_ESTIMATOR
495 if (tca[TCA_RATE-1])
496 gen_replace_estimator(&sch->bstats, &sch->rate_est,
497 sch->stats_lock, tca[TCA_RATE-1]);
498 #endif
499 return 0;
500 }
501
/*
 * Walker state for check_loop().  The qdisc_walker must stay the first
 * member: check_loop_fn() casts the walker pointer it receives back to
 * this structure.
 */
502 struct check_loop_arg
503 {
504 struct qdisc_walker w;
/* qdisc that must not appear among the leaves being walked */
505 struct Qdisc *p;
/* current nesting level of the walk */
506 int depth;
507 };
508
509 static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
510
511 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
512 {
513 struct check_loop_arg arg;
514
515 if (q->ops->cl_ops == NULL)
516 return 0;
517
518 arg.w.stop = arg.w.skip = arg.w.count = 0;
519 arg.w.fn = check_loop_fn;
520 arg.depth = depth;
521 arg.p = p;
522 q->ops->cl_ops->walk(q, &arg.w);
523 return arg.w.stop ? -ELOOP : 0;
524 }
525
526 static int
527 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
528 {
529 struct Qdisc *leaf;
530 struct Qdisc_class_ops *cops = q->ops->cl_ops;
531 struct check_loop_arg *arg = (struct check_loop_arg *)w;
532
533 leaf = cops->leaf(q, cl);
534 if (leaf) {
535 if (leaf == arg->p || arg->depth > 7)
536 return -ELOOP;
537 return check_loop(leaf, arg->p, arg->depth + 1);
538 }
539 return 0;
540 }
541
542 /*
543 * Delete/get qdisc.
544 */
545
/*
 * rtnetlink handler shared by RTM_GETQDISC and RTM_DELQDISC (see
 * pktsched_init()).
 *
 * The qdisc is located through tcm_parent when that is non-zero
 * (TC_H_ROOT: the device's sleeping qdisc; ingress major: the ingress
 * qdisc; anything else: the leaf of that class), otherwise by tcm_handle.
 * A TCA_KIND attribute, if given, must match the qdisc's id.
 *
 * GET sends a notification describing the qdisc.  DEL requires a parent,
 * grafts NULL in place of the qdisc, notifies and destroys what was
 * removed.  Returns 0 or a negative errno.
 */
546 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
547 {
548 struct tcmsg *tcm = NLMSG_DATA(n);
549 struct rtattr **tca = arg;
550 struct net_device *dev;
551 u32 clid = tcm->tcm_parent;
552 struct Qdisc *q = NULL;
553 struct Qdisc *p = NULL;
554 int err;
555
556 if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
557 return -ENODEV;
558
559 if (clid) {
560 if (clid != TC_H_ROOT) {
561 if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
562 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
563 return -ENOENT;
564 q = qdisc_leaf(p, clid);
565 } else { /* ingress */
566 q = dev->qdisc_ingress;
567 }
568 } else {
569 q = dev->qdisc_sleeping;
570 }
571 if (!q)
572 return -ENOENT;
573
/* A handle given together with a parent must agree with what was found. */
574 if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
575 return -EINVAL;
576 } else {
577 if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
578 return -ENOENT;
579 }
580
581 if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
582 return -EINVAL;
583
584 if (n->nlmsg_type == RTM_DELQDISC) {
585 if (!clid)
586 return -EINVAL;
587 if (q->handle == 0)
588 return -ENOENT;
/* qdisc_graft() overwrites q with the qdisc it replaced. */
589 if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0)
590 return err;
591 if (q) {
592 qdisc_notify(skb, n, clid, q, NULL);
593 spin_lock_bh(&dev->queue_lock);
594 qdisc_destroy(q);
595 spin_unlock_bh(&dev->queue_lock);
596 }
597 } else {
598 qdisc_notify(skb, n, clid, NULL, q);
599 }
600 return 0;
601 }
602
603 /*
604 Create/change qdisc.
605 */
606
/*
 * rtnetlink handler for RTM_NEWQDISC (see pktsched_init()).
 *
 * Depending on tcm_parent, tcm_handle and the NLM_F_* flags this ends in
 * one of three ways:
 *   - change the parameters of an existing qdisc (falls out of the lookup
 *     section into "Change qdisc parameters"),
 *   - create a new qdisc and graft it (label create_n_graft),
 *   - graft an already existing qdisc at a new place (label graft).
 * Returns 0 or a negative errno.
 */
607 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
608 {
609 struct tcmsg *tcm = NLMSG_DATA(n);
610 struct rtattr **tca = arg;
611 struct net_device *dev;
612 u32 clid = tcm->tcm_parent;
613 struct Qdisc *q = NULL;
614 struct Qdisc *p = NULL;
615 int err;
616
617 if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
618 return -ENODEV;
619
620 if (clid) {
/* Parent given: q becomes whatever currently sits at that position. */
621 if (clid != TC_H_ROOT) {
622 if (clid != TC_H_INGRESS) {
623 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
624 return -ENOENT;
625 q = qdisc_leaf(p, clid);
626 } else { /*ingress */
627 q = dev->qdisc_ingress;
628 }
629 } else {
630 q = dev->qdisc_sleeping;
631 }
632
633 /* It may be default qdisc, ignore it */
634 if (q && q->handle == 0)
635 q = NULL;
636
637 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
638 if (tcm->tcm_handle) {
639 if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
640 return -EEXIST;
641 if (TC_H_MIN(tcm->tcm_handle))
642 return -EINVAL;
643 if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
644 goto create_n_graft;
645 if (n->nlmsg_flags&NLM_F_EXCL)
646 return -EEXIST;
647 if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
648 return -EINVAL;
/* Refuse to graft a qdisc onto itself or below one of its own leaves. */
649 if (q == p ||
650 (p && check_loop(q, p, 0)))
651 return -ELOOP;
/* The existing qdisc gains a second attachment point. */
652 atomic_inc(&q->refcnt);
653 goto graft;
654 } else {
655 if (q == NULL)
656 goto create_n_graft;
657
658 /* This magic test requires explanation.
659 *
660 * We know, that some child q is already
661 * attached to this parent and have choice:
662 * either to change it or to create/graft new one.
663 *
664 * 1. We are allowed to create/graft only
665 * if CREATE and REPLACE flags are set.
666 *
667 * 2. If EXCL is set, requestor wanted to say,
668 * that qdisc tcm_handle is not expected
669 * to exist, so that we choose create/graft too.
670 *
671 * 3. The last case is when no flags are set.
672 * Alas, it is sort of hole in API, we
673 * cannot decide what to do unambiguously.
674 * For now we select create/graft, if
675 * user gave KIND, which does not match existing.
676 */
677 if ((n->nlmsg_flags&NLM_F_CREATE) &&
678 (n->nlmsg_flags&NLM_F_REPLACE) &&
679 ((n->nlmsg_flags&NLM_F_EXCL) ||
680 (tca[TCA_KIND-1] &&
681 rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))))
682 goto create_n_graft;
683 }
684 }
685 } else {
/* No parent: the qdisc can only be addressed by its handle. */
686 if (!tcm->tcm_handle)
687 return -EINVAL;
688 q = qdisc_lookup(dev, tcm->tcm_handle);
689 }
690
691 /* Change qdisc parameters */
692 if (q == NULL)
693 return -ENOENT;
694 if (n->nlmsg_flags&NLM_F_EXCL)
695 return -EEXIST;
696 if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
697 return -EINVAL;
698 err = qdisc_change(q, tca);
699 if (err == 0)
700 qdisc_notify(skb, n, clid, NULL, q);
701 return err;
702
703 create_n_graft:
704 if (!(n->nlmsg_flags&NLM_F_CREATE))
705 return -ENOENT;
/* For ingress the parent value doubles as the handle passed to qdisc_create(). */
706 if (clid == TC_H_INGRESS)
707 q = qdisc_create(dev, tcm->tcm_parent, tca, &err);
708 else
709 q = qdisc_create(dev, tcm->tcm_handle, tca, &err);
710 if (q == NULL)
711 return err;
712
713 graft:
714 if (1) {
715 struct Qdisc *old_q = NULL;
716 err = qdisc_graft(dev, p, clid, q, &old_q);
717 if (err) {
/* Graft failed: release the qdisc created or referenced above. */
718 if (q) {
719 spin_lock_bh(&dev->queue_lock);
720 qdisc_destroy(q);
721 spin_unlock_bh(&dev->queue_lock);
722 }
723 return err;
724 }
725 qdisc_notify(skb, n, clid, old_q, q);
726 if (old_q) {
727 spin_lock_bh(&dev->queue_lock);
728 qdisc_destroy(old_q);
729 spin_unlock_bh(&dev->queue_lock);
730 }
731 }
732 return 0;
733 }
734
/*
 * Append one netlink message of type "event" describing qdisc "q" to
 * "skb": a tcmsg header, the TCA_KIND attribute, the discipline's own
 * dump and the statistics blocks.
 *
 * Returns skb->len on success.  On any failure the skb is trimmed back
 * to the length it had on entry and -1 is returned.
 */
735 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
736 u32 pid, u32 seq, unsigned flags, int event)
737 {
738 struct tcmsg *tcm;
739 struct nlmsghdr *nlh;
/* Remember the current tail so a partial message can be removed again. */
740 unsigned char *b = skb->tail;
741 struct gnet_dump d;
742
743 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm));
744 nlh->nlmsg_flags = flags;
745 tcm = NLMSG_DATA(nlh);
746 tcm->tcm_family = AF_UNSPEC;
747 tcm->tcm_ifindex = q->dev->ifindex;
748 tcm->tcm_parent = clid;
749 tcm->tcm_handle = q->handle;
/* tcm_info carries the qdisc's current reference count. */
750 tcm->tcm_info = atomic_read(&q->refcnt);
751 RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
752 if (q->ops->dump && q->ops->dump(q, skb) < 0)
753 goto rtattr_failure;
/* Refresh the queue length in the stats that are copied below. */
754 q->qstats.qlen = q->q.qlen;
755
756 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
757 TCA_XSTATS, q->stats_lock, &d) < 0)
758 goto rtattr_failure;
759
760 if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
761 goto rtattr_failure;
762
763 if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
764 #ifdef CONFIG_NET_ESTIMATOR
765 gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
766 #endif
767 gnet_stats_copy_queue(&d, &q->qstats) < 0)
768 goto rtattr_failure;
769
770 if (gnet_stats_finish_copy(&d) < 0)
771 goto rtattr_failure;
772
773 nlh->nlmsg_len = skb->tail - b;
774 return skb->len;
775
/* NOTE(review): nothing in this function jumps to nlmsg_failure explicitly;
   presumably NLMSG_PUT/RTA_PUT expand to gotos targeting these labels -
   confirm against the macro definitions. */
776 nlmsg_failure:
777 rtattr_failure:
778 skb_trim(skb, b - skb->data);
779 return -1;
780 }
781
782 static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
783 u32 clid, struct Qdisc *old, struct Qdisc *new)
784 {
785 struct sk_buff *skb;
786 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
787
788 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
789 if (!skb)
790 return -ENOBUFS;
791
792 if (old && old->handle) {
793 if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
794 goto err_out;
795 }
796 if (new) {
797 if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
798 goto err_out;
799 }
800
801 if (skb->len)
802 return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
803
804 err_out:
805 kfree_skb(skb);
806 return -EINVAL;
807 }
808
/*
 * Netlink dump callback for RTM_GETQDISC: emits one RTM_NEWQDISC message
 * per qdisc of every device.
 *
 * The dump is resumable: cb->args[0] holds the index of the device to
 * continue with and cb->args[1] the index of the qdisc within that
 * device's list.  Both are written back before returning skb->len.
 */
809 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
810 {
811 int idx, q_idx;
812 int s_idx, s_q_idx;
813 struct net_device *dev;
814 struct Qdisc *q;
815
816 s_idx = cb->args[0];
817 s_q_idx = q_idx = cb->args[1];
818 read_lock(&dev_base_lock);
819 for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) {
820 if (idx < s_idx)
821 continue;
/* Past the device the previous pass stopped on: start at its first qdisc. */
822 if (idx > s_idx)
823 s_q_idx = 0;
824 read_lock_bh(&qdisc_tree_lock);
825 q_idx = 0;
826 list_for_each_entry(q, &dev->qdisc_list, list) {
827 if (q_idx < s_q_idx) {
828 q_idx++;
829 continue;
830 }
/* Stop at the first qdisc that could not be added; idx/q_idx then name
   the position saved for the next call. */
831 if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
832 cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) {
833 read_unlock_bh(&qdisc_tree_lock);
834 goto done;
835 }
836 q_idx++;
837 }
838 read_unlock_bh(&qdisc_tree_lock);
839 }
840
841 done:
842 read_unlock(&dev_base_lock);
843
844 cb->args[0] = idx;
845 cb->args[1] = q_idx;
846
847 return skb->len;
848 }
849
850
851
852 /************************************************
853 * Traffic classes manipulation. *
854 ************************************************/
855
856
857
/*
 * rtnetlink handler shared by RTM_NEWTCLASS, RTM_DELTCLASS and
 * RTM_GETTCLASS (see pktsched_init()).
 *
 * Resolves the owning qdisc from tcm_parent/tcm_handle, obtains the class
 * through the qdisc's class operations and then, by message type, changes
 * or creates it, deletes it, or reports it.  Returns 0 or a negative
 * errno.  Note that the local "pid" is the parent class id, not a process
 * id.
 */
858 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
859 {
860 struct tcmsg *tcm = NLMSG_DATA(n);
861 struct rtattr **tca = arg;
862 struct net_device *dev;
863 struct Qdisc *q = NULL;
864 struct Qdisc_class_ops *cops;
865 unsigned long cl = 0;
866 unsigned long new_cl;
867 u32 pid = tcm->tcm_parent;
868 u32 clid = tcm->tcm_handle;
869 u32 qid = TC_H_MAJ(clid);
870 int err;
871
872 if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
873 return -ENODEV;
874
875 /*
876 parent == TC_H_UNSPEC - unspecified parent.
877 parent == TC_H_ROOT - class is root, which has no parent.
878 parent == X:0 - parent is root class.
879 parent == X:Y - parent is a node in hierarchy.
880 parent == 0:Y - parent is X:Y, where X:0 is qdisc.
881
882 handle == 0:0 - generate handle from kernel pool.
883 handle == 0:Y - class is X:Y, where X:0 is qdisc.
884 handle == X:Y - clear.
885 handle == X:0 - root class.
886 */
887
888 /* Step 1. Determine qdisc handle X:0 */
889
890 if (pid != TC_H_ROOT) {
891 u32 qid1 = TC_H_MAJ(pid);
892
893 if (qid && qid1) {
894 /* If both majors are known, they must be identical. */
895 if (qid != qid1)
896 return -EINVAL;
897 } else if (qid1) {
898 qid = qid1;
899 } else if (qid == 0)
/* Neither major given: fall back to the device's sleeping qdisc. */
900 qid = dev->qdisc_sleeping->handle;
901
902 /* Now qid is genuine qdisc handle consistent
903 both with parent and child.
904
905 TC_H_MAJ(pid) still may be unspecified, complete it now.
906 */
907 if (pid)
908 pid = TC_H_MAKE(qid, pid);
909 } else {
910 if (qid == 0)
911 qid = dev->qdisc_sleeping->handle;
912 }
913
914 /* OK. Locate qdisc */
915 if ((q = qdisc_lookup(dev, qid)) == NULL)
916 return -ENOENT;
917
918 /* And check that it supports classes */
919 cops = q->ops->cl_ops;
920 if (cops == NULL)
921 return -EINVAL;
922
923 /* Now try to get class */
924 if (clid == 0) {
925 if (pid == TC_H_ROOT)
926 clid = qid;
927 } else
928 clid = TC_H_MAKE(qid, clid);
929
930 if (clid)
931 cl = cops->get(q, clid);
932
933 if (cl == 0) {
/* No such class: only RTM_NEWTCLASS with NLM_F_CREATE may continue. */
934 err = -ENOENT;
935 if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
936 goto out;
937 } else {
938 switch (n->nlmsg_type) {
939 case RTM_NEWTCLASS:
940 err = -EEXIST;
941 if (n->nlmsg_flags&NLM_F_EXCL)
942 goto out;
943 break;
944 case RTM_DELTCLASS:
945 err = cops->delete(q, cl);
946 if (err == 0)
947 tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
948 goto out;
949 case RTM_GETTCLASS:
950 err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
951 goto out;
952 default:
953 err = -EINVAL;
954 goto out;
955 }
956 }
957
/* Create (cl == 0) or modify the class; change() reports it via new_cl. */
958 new_cl = cl;
959 err = cops->change(q, clid, pid, tca, &new_cl);
960 if (err == 0)
961 tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);
962
963 out:
/* Release the class obtained with cops->get() above, if any. */
964 if (cl)
965 cops->put(q, cl);
966
967 return err;
968 }
969
970
/*
 * Append one netlink message of type "event" describing class "cl" of
 * qdisc "q" to "skb": a tcmsg header, the TCA_KIND attribute, the class
 * dump and the class statistics.
 *
 * Returns skb->len on success.  On any failure the skb is trimmed back
 * to the length it had on entry and -1 is returned.
 */
971 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
972 unsigned long cl,
973 u32 pid, u32 seq, unsigned flags, int event)
974 {
975 struct tcmsg *tcm;
976 struct nlmsghdr *nlh;
/* Remember the current tail so a partial message can be removed again. */
977 unsigned char *b = skb->tail;
978 struct gnet_dump d;
979 struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
980
981 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm));
982 nlh->nlmsg_flags = flags;
983 tcm = NLMSG_DATA(nlh);
984 tcm->tcm_family = AF_UNSPEC;
985 tcm->tcm_ifindex = q->dev->ifindex;
/* Parent and handle start out as the qdisc's handle; tcm is handed to
   cl_ops->dump() below (presumably so it can fill in the class' own
   values - confirm against the class implementations). */
986 tcm->tcm_parent = q->handle;
987 tcm->tcm_handle = q->handle;
988 tcm->tcm_info = 0;
989 RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
990 if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
991 goto rtattr_failure;
992
993 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
994 TCA_XSTATS, q->stats_lock, &d) < 0)
995 goto rtattr_failure;
996
997 if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
998 goto rtattr_failure;
999
1000 if (gnet_stats_finish_copy(&d) < 0)
1001 goto rtattr_failure;
1002
1003 nlh->nlmsg_len = skb->tail - b;
1004 return skb->len;
1005
/* NOTE(review): nlmsg_failure has no explicit goto in this function;
   presumably NLMSG_PUT/RTA_PUT jump to these labels - confirm against the
   macro definitions. */
1006 nlmsg_failure:
1007 rtattr_failure:
1008 skb_trim(skb, b - skb->data);
1009 return -1;
1010 }
1011
1012 static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
1013 struct Qdisc *q, unsigned long cl, int event)
1014 {
1015 struct sk_buff *skb;
1016 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1017
1018 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1019 if (!skb)
1020 return -ENOBUFS;
1021
1022 if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
1023 kfree_skb(skb);
1024 return -EINVAL;
1025 }
1026
1027 return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1028 }
1029
/*
 * Walker state for class dumps.  The qdisc_walker must stay the first
 * member: qdisc_class_dump() casts the walker pointer it receives back to
 * this structure.
 */
1030 struct qdisc_dump_args
1031 {
1032 struct qdisc_walker w;
/* skb the class messages are appended to */
1033 struct sk_buff *skb;
/* netlink dump context (request skb and header) */
1034 struct netlink_callback *cb;
1035 };
1036
1037 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1038 {
1039 struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1040
1041 return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
1042 a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1043 }
1044
/*
 * Netlink dump callback for RTM_GETTCLASS: walks the classes of every
 * classful qdisc on the device named by tcm_ifindex, optionally limited
 * to the qdisc whose handle equals the major part of tcm_parent.
 *
 * The dump is resumable: cb->args[0] is the index of the qdisc to
 * continue with, cb->args[1] the number of classes of that qdisc already
 * visited.  Returns skb->len, or 0 when the request is too short or the
 * device does not exist.
 */
1045 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1046 {
1047 int t;
1048 int s_t;
1049 struct net_device *dev;
1050 struct Qdisc *q;
1051 struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
1052 struct qdisc_dump_args arg;
1053
1054 if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
1055 return 0;
/* Takes a device reference; released by dev_put() at the end. */
1056 if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
1057 return 0;
1058
1059 s_t = cb->args[0];
1060 t = 0;
1061
1062 read_lock_bh(&qdisc_tree_lock);
1063 list_for_each_entry(q, &dev->qdisc_list, list) {
/* Skip qdiscs already dumped, classless ones, and ones not asked for. */
1064 if (t < s_t || !q->ops->cl_ops ||
1065 (tcm->tcm_parent &&
1066 TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1067 t++;
1068 continue;
1069 }
/* Moving on to a new qdisc: forget the per-qdisc resume state. */
1070 if (t > s_t)
1071 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1072 arg.w.fn = qdisc_class_dump;
1073 arg.skb = skb;
1074 arg.cb = cb;
1075 arg.w.stop = 0;
1076 arg.w.skip = cb->args[1];
1077 arg.w.count = 0;
1078 q->ops->cl_ops->walk(q, &arg.w);
1079 cb->args[1] = arg.w.count;
/* Walk was cut short: keep t pointing at this qdisc for the next call. */
1080 if (arg.w.stop)
1081 break;
1082 t++;
1083 }
1084 read_unlock_bh(&qdisc_tree_lock);
1085
1086 cb->args[0] = t;
1087
1088 dev_put(dev);
1089 return skb->len;
1090 }
1091
1092 /* Main classifier routine: scans classifier chain attached
1093 to this qdisc, (optionally) tests for protocol and asks
1094 specific classifiers.
1095 */
/*
 * Run "skb" through the classifier chain starting at "tp".
 *
 * A classifier is consulted when its protocol equals the packet's or is
 * ETH_P_ALL.  The first non-negative classifier result is returned; -1
 * means no classifier in the chain produced one.
 *
 * With CONFIG_NET_CLS_ACT a TC_ACT_RECLASSIFY result restarts the scan
 * from the head of the chain.  The restart count is kept in the verdict
 * bits of skb->tc_verd; once it exceeds MAX_REC_LOOP the packet is
 * reported and TC_ACT_SHOT is returned.
 */
1096 int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
1097 struct tcf_result *res)
1098 {
1099 int err = 0;
/* This initial value is overwritten by the assignment after the
   reclassify label before it is ever read. */
1100 u32 protocol = skb->protocol;
1101 #ifdef CONFIG_NET_CLS_ACT
/* Head of the chain, kept for restarting on reclassify. */
1102 struct tcf_proto *otp = tp;
1103 reclassify:
1104 #endif
1105 protocol = skb->protocol;
1106
1107 for ( ; tp; tp = tp->next) {
1108 if ((tp->protocol == protocol ||
1109 tp->protocol == __constant_htons(ETH_P_ALL)) &&
1110 (err = tp->classify(skb, tp, res)) >= 0) {
1111 #ifdef CONFIG_NET_CLS_ACT
1112 if ( TC_ACT_RECLASSIFY == err) {
1113 __u32 verd = (__u32) G_TC_VERD(skb->tc_verd);
/* tp is reset before the message below, so the rule printed there is the
   head of the chain. */
1114 tp = otp;
1115
1116 if (MAX_REC_LOOP < verd++) {
1117 printk("rule prio %d protocol %02x reclassify is buggy packet dropped\n",
1118 tp->prio&0xffff, ntohs(tp->protocol));
1119 return TC_ACT_SHOT;
1120 }
1121 skb->tc_verd = SET_TC_VERD(skb->tc_verd,verd);
1122 goto reclassify;
1123 } else {
/* Final verdict reached: clear the restart counter again. */
1124 if (skb->tc_verd)
1125 skb->tc_verd = SET_TC_VERD(skb->tc_verd,0);
1126 return err;
1127 }
1128 #else
1129
1130 return err;
1131 #endif
1132 }
1133
1134 }
1135 return -1;
1136 }
1137
/* Clock conversion factors reported through the "psched" proc file; both
   default to 1 and are set by pktsched_init() / psched_calibrate_clock()
   depending on the configured clock source. */
1138 static int psched_us_per_tick = 1;
1139 static int psched_tick_per_us = 1;
1140
1141 #ifdef CONFIG_PROC_FS
/* seq_file show routine: prints tick_per_us, us_per_tick, 1000000 and HZ
   as four 8-digit hexadecimal words on one line. */
1142 static int psched_show(struct seq_file *seq, void *v)
1143 {
1144 seq_printf(seq, "%08x %08x %08x %08x\n",
1145 psched_tick_per_us, psched_us_per_tick,
1146 1000000, HZ);
1147
1148 return 0;
1149 }
1150
/* open handler: binds psched_show() as a single-record seq_file. */
1151 static int psched_open(struct inode *inode, struct file *file)
1152 {
1153 return single_open(file, psched_show, PDE(inode)->data);
1154 }
1155
/* File operations registered for the "psched" proc entry in pktsched_init(). */
1156 static struct file_operations psched_fops = {
1157 .owner = THIS_MODULE,
1158 .open = psched_open,
1159 .read = seq_read,
1160 .llseek = seq_lseek,
1161 .release = single_release,
1162 };
1163 #endif
1164
1165 #ifdef CONFIG_NET_SCH_CLK_CPU
/* Scaling values computed by psched_calibrate_clock(); exported for use
   outside this file. */
1166 psched_tdiff_t psched_clock_per_hz;
1167 int psched_clock_scale;
1168 EXPORT_SYMBOL(psched_clock_per_hz);
1169 EXPORT_SYMBOL(psched_clock_scale);
1170
/* NOTE(review): not written anywhere in this file; presumably maintained
   by the PSCHED_GET_TIME() macro - confirm in pkt_sched.h. */
1171 psched_time_t psched_time_base;
1172 cycles_t psched_time_mark;
1173 EXPORT_SYMBOL(psched_time_mark);
1174 EXPORT_SYMBOL(psched_time_base);
1175
1176 /*
1177 * Periodically adjust psched_time_base to avoid overflow
1178 * with 32-bit get_cycles(). Safe up to 4GHz CPU.
1179 */
1180 static void psched_tick(unsigned long);
1181 static struct timer_list psched_timer = TIMER_INITIALIZER(psched_tick, 0, 0);
1182
/* Timer handler: when cycles_t is 32 bits wide, sample the clock once and
   re-arm the timer one second (HZ jiffies) ahead.  Does nothing when
   cycles_t is wider.  The argument is unused. */
1183 static void psched_tick(unsigned long dummy)
1184 {
1185 if (sizeof(cycles_t) == sizeof(u32)) {
1186 psched_time_t dummy_stamp;
1187 PSCHED_GET_TIME(dummy_stamp);
1188 psched_timer.expires = jiffies + 1*HZ;
1189 add_timer(&psched_timer);
1190 }
1191 }
1192
1193 int __init psched_calibrate_clock(void)
1194 {
1195 psched_time_t stamp, stamp1;
1196 struct timeval tv, tv1;
1197 psched_tdiff_t delay;
1198 long rdelay;
1199 unsigned long stop;
1200
1201 psched_tick(0);
1202 stop = jiffies + HZ/10;
1203 PSCHED_GET_TIME(stamp);
1204 do_gettimeofday(&tv);
1205 while (time_before(jiffies, stop)) {
1206 barrier();
1207 cpu_relax();
1208 }
1209 PSCHED_GET_TIME(stamp1);
1210 do_gettimeofday(&tv1);
1211
1212 delay = PSCHED_TDIFF(stamp1, stamp);
1213 rdelay = tv1.tv_usec - tv.tv_usec;
1214 rdelay += (tv1.tv_sec - tv.tv_sec)*1000000;
1215 if (rdelay > delay)
1216 return -1;
1217 delay /= rdelay;
1218 psched_tick_per_us = delay;
1219 while ((delay>>=1) != 0)
1220 psched_clock_scale++;
1221 psched_us_per_tick = 1<<psched_clock_scale;
1222 psched_clock_per_hz = (psched_tick_per_us*(1000000/HZ))>>psched_clock_scale;
1223 return 0;
1224 }
1225 #endif
1226
/*
 * Subsystem initialisation: sets up the scheduler clock factors for the
 * configured clock source, installs the qdisc/class rtnetlink handlers in
 * the PF_UNSPEC link table, registers the pfifo and bfifo disciplines and
 * creates the "psched" proc entry.
 *
 * Returns 0, or -1 when CPU clock calibration fails.
 */
1227 static int __init pktsched_init(void)
1228 {
1229 struct rtnetlink_link *link_p;
1230
1231 #ifdef CONFIG_NET_SCH_CLK_CPU
1232 if (psched_calibrate_clock() < 0)
1233 return -1;
1234 #elif defined(CONFIG_NET_SCH_CLK_JIFFIES)
1235 psched_tick_per_us = HZ<<PSCHED_JSCALE;
1236 psched_us_per_tick = 1000000;
1237 #endif
1238
1239 link_p = rtnetlink_links[PF_UNSPEC];
1240
1241 /* Setup rtnetlink links. It is made here to avoid
1242 exporting large number of public symbols.
1243 */
1244
1245 if (link_p) {
1246 link_p[RTM_NEWQDISC-RTM_BASE].doit = tc_modify_qdisc;
/* Delete and get share one handler; it switches on nlmsg_type. */
1247 link_p[RTM_DELQDISC-RTM_BASE].doit = tc_get_qdisc;
1248 link_p[RTM_GETQDISC-RTM_BASE].doit = tc_get_qdisc;
1249 link_p[RTM_GETQDISC-RTM_BASE].dumpit = tc_dump_qdisc;
1250 link_p[RTM_NEWTCLASS-RTM_BASE].doit = tc_ctl_tclass;
1251 link_p[RTM_DELTCLASS-RTM_BASE].doit = tc_ctl_tclass;
1252 link_p[RTM_GETTCLASS-RTM_BASE].doit = tc_ctl_tclass;
1253 link_p[RTM_GETTCLASS-RTM_BASE].dumpit = tc_dump_tclass;
1254 }
1255
/* NOTE(review): the results of both registrations and of the proc entry
   creation are not checked. */
1256 register_qdisc(&pfifo_qdisc_ops);
1257 register_qdisc(&bfifo_qdisc_ops);
1258 proc_net_fops_create("psched", 0, &psched_fops);
1259
1260 return 0;
1261 }
1262
/* Run pktsched_init() as a subsystem initcall. */
1263 subsys_initcall(pktsched_init);
1264
/* Entry points of this file made available to other kernel code. */
1265 EXPORT_SYMBOL(qdisc_get_rtab);
1266 EXPORT_SYMBOL(qdisc_put_rtab);
1267 EXPORT_SYMBOL(register_qdisc);
1268 EXPORT_SYMBOL(unregister_qdisc);
1269 EXPORT_SYMBOL(tc_classify);
1270
|
This page was automatically generated by the
LXR engine.
|