Commit aec0a40

Eric Dumazet authored and davem330 committed
netem: use rb tree to implement the time queue
The following typical setup, implementing a ~100 ms RTT and a large
amount of reordering, has very poor performance, because netem
implements the time queue using a linked list.
-----------------------------------------------------------
ETH=eth0
IFB=ifb0
modprobe ifb
ip link set dev $IFB up
tc qdisc add dev $ETH ingress 2>/dev/null
tc filter add dev $ETH parent ffff: \
   protocol ip u32 match u32 0 0 flowid 1:1 action mirred egress \
   redirect dev $IFB
ethtool -K $ETH gro off tso off gso off
tc qdisc add dev $IFB root netem delay 50ms 10ms limit 100000
tc qd add dev $ETH root netem delay 50ms limit 100000
---------------------------------------------------------
Switch the netem time queue to an rb tree, so this kind of setup can
work at high speed.

Signed-off-by: Eric Dumazet <[email protected]>
Cc: Stephen Hemminger <[email protected]>
Signed-off-by: David S. Miller <[email protected]>
1 parent: d0b5e51

1 file changed: 85 additions (+), 24 deletions (-)

net/sched/sch_netem.c
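To make the diff easier to follow, here is a standalone sketch of the technique it applies: delayed packets are kept ordered by time_to_send in a search tree, so enqueue costs O(log n) instead of the O(n) list walk that made the heavy-reorder setup above crawl, and the next packet to send is always the leftmost node. The demo mirrors the patch's insert/extract-min shape with a plain, unbalanced BST in userspace; the kernel instead uses the self-balancing red-black tree from <linux/rbtree.h> (rb_link_node()/rb_insert_color() on insert, rb_first()/rb_erase() on dequeue), which bounds the depth even on adversarial input. The struct pkt type and both helper names are invented for this demo.

/* Userspace sketch of the patch's time queue: a search tree keyed on
 * time_to_send. Unbalanced BST for brevity; the kernel patch uses a
 * real rb tree from <linux/rbtree.h> for guaranteed O(log n).
 */
#include <stdio.h>
#include <stdlib.h>

struct pkt {
	unsigned long time_to_send;	/* the rb tree key in netem */
	struct pkt *left, *right;	/* stand-ins for rb_left/rb_right */
};

/* Mirrors the new tfifo_enqueue(): walk down, comparing send times.
 * ">=" sends equal keys right, so packets sharing a send time keep
 * FIFO order, exactly as the patch does.
 */
static void demo_enqueue(struct pkt **root, struct pkt *nskb)
{
	struct pkt **p = root;

	while (*p) {
		if (nskb->time_to_send >= (*p)->time_to_send)
			p = &(*p)->right;
		else
			p = &(*p)->left;
	}
	nskb->left = nskb->right = NULL;
	*p = nskb;	/* kernel: rb_link_node() + rb_insert_color() */
}

/* Mirrors rb_first() + rb_erase(): the leftmost node holds the
 * earliest time_to_send; splicing it out is trivial since the
 * minimum has no left child.
 */
static struct pkt *demo_dequeue(struct pkt **root)
{
	struct pkt **p = root;
	struct pkt *min;

	if (!*p)
		return NULL;
	while ((*p)->left)
		p = &(*p)->left;
	min = *p;
	*p = min->right;
	return min;
}

int main(void)
{
	struct pkt *root = NULL;
	unsigned long t[] = { 50, 10, 40, 10, 30 };	/* out-of-order arrivals */
	struct pkt *min;
	unsigned int i;

	for (i = 0; i < sizeof(t) / sizeof(t[0]); i++) {
		struct pkt *skb = malloc(sizeof(*skb));

		skb->time_to_send = t[i];
		demo_enqueue(&root, skb);
	}
	while ((min = demo_dequeue(&root))) {
		printf("%lu ", min->time_to_send);
		free(min);
	}
	printf("\n");
	return 0;
}

Running it prints "10 10 30 40 50": the reordered arrivals come back out in time order, which is what the patched netem_dequeue() path now gets from rb_first().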
@@ -23,6 +23,7 @@
 #include <linux/vmalloc.h>
 #include <linux/rtnetlink.h>
 #include <linux/reciprocal_div.h>
+#include <linux/rbtree.h>
 
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
@@ -68,7 +69,8 @@
  */
 
 struct netem_sched_data {
-	/* internal t(ime)fifo qdisc uses sch->q and sch->limit */
+	/* internal t(ime)fifo qdisc uses t_root and sch->limit */
+	struct rb_root t_root;
 
 	/* optional qdisc for classful handling (NULL at netem init) */
 	struct Qdisc	*qdisc;
@@ -128,10 +130,35 @@ struct netem_sched_data {
  */
 struct netem_skb_cb {
 	psched_time_t	time_to_send;
+	ktime_t		tstamp_save;
 };
 
+/* Because space in skb->cb[] is tight, netem overloads skb->next/prev/tstamp
+ * to hold a rb_node structure.
+ *
+ * If struct sk_buff layout is changed, the following checks will complain.
+ */
+static struct rb_node *netem_rb_node(struct sk_buff *skb)
+{
+	BUILD_BUG_ON(offsetof(struct sk_buff, next) != 0);
+	BUILD_BUG_ON(offsetof(struct sk_buff, prev) !=
+		     offsetof(struct sk_buff, next) + sizeof(skb->next));
+	BUILD_BUG_ON(offsetof(struct sk_buff, tstamp) !=
+		     offsetof(struct sk_buff, prev) + sizeof(skb->prev));
+	BUILD_BUG_ON(sizeof(struct rb_node) > sizeof(skb->next) +
+					      sizeof(skb->prev) +
+					      sizeof(skb->tstamp));
+	return (struct rb_node *)&skb->next;
+}
+
+static struct sk_buff *netem_rb_to_skb(struct rb_node *rb)
+{
+	return (struct sk_buff *)rb;
+}
+
 static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
 {
+	/* we assume we can use skb next/prev/tstamp as storage for rb_node */
 	qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
 	return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
 }
@@ -333,20 +360,23 @@ static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sche
 
 static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
 {
-	struct sk_buff_head *list = &sch->q;
+	struct netem_sched_data *q = qdisc_priv(sch);
 	psched_time_t tnext = netem_skb_cb(nskb)->time_to_send;
-	struct sk_buff *skb = skb_peek_tail(list);
+	struct rb_node **p = &q->t_root.rb_node, *parent = NULL;
 
-	/* Optimize for add at tail */
-	if (likely(!skb || tnext >= netem_skb_cb(skb)->time_to_send))
-		return __skb_queue_tail(list, nskb);
+	while (*p) {
+		struct sk_buff *skb;
 
-	skb_queue_reverse_walk(list, skb) {
+		parent = *p;
+		skb = netem_rb_to_skb(parent);
 		if (tnext >= netem_skb_cb(skb)->time_to_send)
-			break;
+			p = &parent->rb_right;
+		else
+			p = &parent->rb_left;
 	}
-
-	__skb_queue_after(list, skb, nskb);
+	rb_link_node(netem_rb_node(nskb), parent, p);
+	rb_insert_color(netem_rb_node(nskb), &q->t_root);
+	sch->q.qlen++;
 }
 
 /*
@@ -436,23 +466,28 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		now = psched_get_time();
 
 		if (q->rate) {
-			struct sk_buff_head *list = &sch->q;
+			struct sk_buff *last;
 
-			if (!skb_queue_empty(list)) {
+			if (!skb_queue_empty(&sch->q))
+				last = skb_peek_tail(&sch->q);
+			else
+				last = netem_rb_to_skb(rb_last(&q->t_root));
+			if (last) {
 				/*
 				 * Last packet in queue is reference point (now),
 				 * calculate this time bonus and subtract
 				 * from delay.
 				 */
-				delay -= netem_skb_cb(skb_peek_tail(list))->time_to_send - now;
+				delay -= netem_skb_cb(last)->time_to_send - now;
 				delay = max_t(psched_tdiff_t, 0, delay);
-				now = netem_skb_cb(skb_peek_tail(list))->time_to_send;
+				now = netem_skb_cb(last)->time_to_send;
 			}
 
 			delay += packet_len_2_sched_time(skb->len, q);
 		}
 
 		cb->time_to_send = now + delay;
+		cb->tstamp_save = skb->tstamp;
 		++q->counter;
 		tfifo_enqueue(skb, sch);
 	} else {
@@ -476,6 +511,21 @@ static unsigned int netem_drop(struct Qdisc *sch)
 	unsigned int len;
 
 	len = qdisc_queue_drop(sch);
+
+	if (!len) {
+		struct rb_node *p = rb_first(&q->t_root);
+
+		if (p) {
+			struct sk_buff *skb = netem_rb_to_skb(p);
+
+			rb_erase(p, &q->t_root);
+			sch->q.qlen--;
+			skb->next = NULL;
+			skb->prev = NULL;
+			len = qdisc_pkt_len(skb);
+			kfree_skb(skb);
+		}
+	}
 	if (!len && q->qdisc && q->qdisc->ops->drop)
 		len = q->qdisc->ops->drop(q->qdisc);
 	if (len)
@@ -488,19 +538,32 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch)
 {
 	struct netem_sched_data *q = qdisc_priv(sch);
 	struct sk_buff *skb;
+	struct rb_node *p;
 
 	if (qdisc_is_throttled(sch))
 		return NULL;
 
 tfifo_dequeue:
-	skb = qdisc_peek_head(sch);
+	skb = __skb_dequeue(&sch->q);
 	if (skb) {
-		const struct netem_skb_cb *cb = netem_skb_cb(skb);
+deliver:
+		sch->qstats.backlog -= qdisc_pkt_len(skb);
+		qdisc_unthrottled(sch);
+		qdisc_bstats_update(sch, skb);
+		return skb;
+	}
+	p = rb_first(&q->t_root);
+	if (p) {
+		skb = netem_rb_to_skb(p);
 
 		/* if more time remaining? */
-		if (cb->time_to_send <= psched_get_time()) {
-			__skb_unlink(skb, &sch->q);
-			sch->qstats.backlog -= qdisc_pkt_len(skb);
+		if (netem_skb_cb(skb)->time_to_send <= psched_get_time()) {
+			rb_erase(p, &q->t_root);
+
+			sch->q.qlen--;
+			skb->next = NULL;
+			skb->prev = NULL;
+			skb->tstamp = netem_skb_cb(skb)->tstamp_save;
 
 #ifdef CONFIG_NET_CLS_ACT
 			/*
@@ -522,18 +585,16 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch)
 				}
 				goto tfifo_dequeue;
 			}
-deliver:
-			qdisc_unthrottled(sch);
-			qdisc_bstats_update(sch, skb);
-			return skb;
+			goto deliver;
 		}
 
 		if (q->qdisc) {
 			skb = q->qdisc->ops->dequeue(q->qdisc);
 			if (skb)
 				goto deliver;
		}
-		qdisc_watchdog_schedule(&q->watchdog, cb->time_to_send);
+		qdisc_watchdog_schedule(&q->watchdog,
+					netem_skb_cb(skb)->time_to_send);
 	}
 
 	if (q->qdisc) {
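One detail worth noting in the hunks above: netem_rb_node() overlays a struct rb_node on skb->next/prev/tstamp because skb->cb[] has no spare room, and the BUILD_BUG_ON() checks turn any future change to the sk_buff layout into a compile-time error rather than silent memory corruption. Below is a userspace analogue of that guard pattern, using C11 _Static_assert with offsetof; every struct name in it is invented for the demo.

/* Userspace analogue of the patch's layout guards: overlay a tree
 * node on three contiguous fields, and let the compiler verify the
 * layout assumptions that the overlay depends on.
 */
#include <stddef.h>

struct fake_rb_node { unsigned long parent_color; void *left, *right; };

struct fake_skb {
	struct fake_skb *next;	/* must sit at offset 0 ... */
	struct fake_skb *prev;	/* ... immediately followed by prev ... */
	long long tstamp;	/* ... then tstamp, for the overlay */
	char cb[48];
};

/* Same shape as the patch's BUILD_BUG_ON() checks: reordering the
 * fields breaks the build instead of corrupting memory at runtime.
 */
_Static_assert(offsetof(struct fake_skb, next) == 0,
	       "next must be first");
_Static_assert(offsetof(struct fake_skb, prev) ==
	       offsetof(struct fake_skb, next) + sizeof(struct fake_skb *),
	       "prev must follow next");
_Static_assert(offsetof(struct fake_skb, tstamp) ==
	       offsetof(struct fake_skb, prev) + sizeof(struct fake_skb *),
	       "tstamp must follow prev");
_Static_assert(sizeof(struct fake_rb_node) <=
	       2 * sizeof(struct fake_skb *) + sizeof(long long),
	       "rb_node must fit in next+prev+tstamp");

/* The overlay itself: reuse the three fields as tree-node storage. */
static struct fake_rb_node *fake_rb_node(struct fake_skb *skb)
{
	return (struct fake_rb_node *)&skb->next;
}

int main(void)
{
	struct fake_skb skb = { 0 };

	/* the rb_node view aliases the start of the skb */
	return fake_rb_node(&skb) == (void *)&skb ? 0 : 1;
}

This is also why the dequeue and drop paths reset skb->next, skb->prev and restore skb->tstamp from tstamp_save once a packet leaves the tree: the fields double as rb_node storage while queued, so they must be made sane again before anyone else looks at the skb.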
