From aebe4426ccaa4838f36ea805cdf7d76503e65117 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Sat, 27 Jun 2020 01:45:25 +0300 Subject: [PATCH 1/5] net: sched: Pass root lock to Qdisc_ops.enqueue A following patch introduces qevents, points in qdisc algorithm where packet can be processed by user-defined filters. Should this processing lead to a situation where a new packet is to be enqueued on the same port, holding the root lock would lead to deadlocks. To solve the issue, qevent handler needs to unlock and relock the root lock when necessary. To that end, add the root lock argument to the qdisc op enqueue, and propagate throughout. Signed-off-by: Petr Machata Signed-off-by: David S. Miller --- include/net/sch_generic.h | 6 ++++-- net/core/dev.c | 4 ++-- net/sched/sch_atm.c | 4 ++-- net/sched/sch_blackhole.c | 2 +- net/sched/sch_cake.c | 2 +- net/sched/sch_cbq.c | 4 ++-- net/sched/sch_cbs.c | 18 +++++++++--------- net/sched/sch_choke.c | 2 +- net/sched/sch_codel.c | 2 +- net/sched/sch_drr.c | 4 ++-- net/sched/sch_dsmark.c | 4 ++-- net/sched/sch_etf.c | 2 +- net/sched/sch_ets.c | 4 ++-- net/sched/sch_fifo.c | 6 +++--- net/sched/sch_fq.c | 2 +- net/sched/sch_fq_codel.c | 2 +- net/sched/sch_fq_pie.c | 2 +- net/sched/sch_generic.c | 4 ++-- net/sched/sch_gred.c | 2 +- net/sched/sch_hfsc.c | 6 +++--- net/sched/sch_hhf.c | 2 +- net/sched/sch_htb.c | 4 ++-- net/sched/sch_multiq.c | 4 ++-- net/sched/sch_netem.c | 8 ++++---- net/sched/sch_pie.c | 2 +- net/sched/sch_plug.c | 2 +- net/sched/sch_prio.c | 6 +++--- net/sched/sch_qfq.c | 4 ++-- net/sched/sch_red.c | 4 ++-- net/sched/sch_sfb.c | 4 ++-- net/sched/sch_sfq.c | 2 +- net/sched/sch_skbprio.c | 2 +- net/sched/sch_taprio.c | 4 ++-- net/sched/sch_tbf.c | 10 +++++----- net/sched/sch_teql.c | 4 ++-- 35 files changed, 73 insertions(+), 71 deletions(-) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index c510b03b97513..fceb3d63c9256 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ 
-57,6 +57,7 @@ struct qdisc_skb_head { struct Qdisc { int (*enqueue)(struct sk_buff *skb, struct Qdisc *sch, + spinlock_t *root_lock, struct sk_buff **to_free); struct sk_buff * (*dequeue)(struct Qdisc *sch); unsigned int flags; @@ -241,6 +242,7 @@ struct Qdisc_ops { int (*enqueue)(struct sk_buff *skb, struct Qdisc *sch, + spinlock_t *root_lock, struct sk_buff **to_free); struct sk_buff * (*dequeue)(struct Qdisc *); struct sk_buff * (*peek)(struct Qdisc *); @@ -788,11 +790,11 @@ static inline void qdisc_calculate_pkt_len(struct sk_buff *skb, #endif } -static inline int qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, +static inline int qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_lock, struct sk_buff **to_free) { qdisc_calculate_pkt_len(skb, sch); - return sch->enqueue(skb, sch, to_free); + return sch->enqueue(skb, sch, root_lock, to_free); } static inline void _bstats_update(struct gnet_stats_basic_packed *bstats, diff --git a/net/core/dev.c b/net/core/dev.c index 3a46b86cbd67f..c02bae9278122 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3749,7 +3749,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, qdisc_calculate_pkt_len(skb, q); if (q->flags & TCQ_F_NOLOCK) { - rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK; + rc = q->enqueue(skb, q, NULL, &to_free) & NET_XMIT_MASK; qdisc_run(q); if (unlikely(to_free)) @@ -3792,7 +3792,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, qdisc_run_end(q); rc = NET_XMIT_SUCCESS; } else { - rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK; + rc = q->enqueue(skb, q, root_lock, &to_free) & NET_XMIT_MASK; if (qdisc_run_begin(q)) { if (unlikely(contended)) { spin_unlock(&q->busylock); diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index ee12ca9f55b4f..fb6b16c4e46d6 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -374,7 +374,7 @@ static struct tcf_block *atm_tc_tcf_block(struct Qdisc *sch, unsigned long cl, /* 
--------------------------- Qdisc operations ---------------------------- */ -static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch, +static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_lock, struct sk_buff **to_free) { struct atm_qdisc_data *p = qdisc_priv(sch); @@ -432,7 +432,7 @@ static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch, #endif } - ret = qdisc_enqueue(skb, flow->q, to_free); + ret = qdisc_enqueue(skb, flow->q, root_lock, to_free); if (ret != NET_XMIT_SUCCESS) { drop: __maybe_unused if (net_xmit_drop_count(ret)) { diff --git a/net/sched/sch_blackhole.c b/net/sched/sch_blackhole.c index a7f7667ae9849..187644657c4f9 100644 --- a/net/sched/sch_blackhole.c +++ b/net/sched/sch_blackhole.c @@ -13,7 +13,7 @@ #include #include -static int blackhole_enqueue(struct sk_buff *skb, struct Qdisc *sch, +static int blackhole_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_lock, struct sk_buff **to_free) { qdisc_drop(skb, sch, to_free); diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 65a95cb094e8b..e9c502dd29a27 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -1687,7 +1687,7 @@ static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t, static void cake_reconfigure(struct Qdisc *sch); -static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, +static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_lock, struct sk_buff **to_free) { struct cake_sched_data *q = qdisc_priv(sch); diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 39b427dc75128..052d4a1af69a4 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -356,7 +356,7 @@ cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl) } static int -cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch, +cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_lock, struct sk_buff **to_free) { struct cbq_sched_data *q = qdisc_priv(sch); @@ -373,7 
+373,7 @@ cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch, return ret; } - ret = qdisc_enqueue(skb, cl->q, to_free); + ret = qdisc_enqueue(skb, cl->q, root_lock, to_free); if (ret == NET_XMIT_SUCCESS) { sch->q.qlen++; cbq_mark_toplevel(q, cl); diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c index 2eaac2ff380fa..7af15ebe07f78 100644 --- a/net/sched/sch_cbs.c +++ b/net/sched/sch_cbs.c @@ -77,7 +77,7 @@ struct cbs_sched_data { s64 sendslope; /* in bytes/s */ s64 idleslope; /* in bytes/s */ struct qdisc_watchdog watchdog; - int (*enqueue)(struct sk_buff *skb, struct Qdisc *sch, + int (*enqueue)(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_lock, struct sk_buff **to_free); struct sk_buff *(*dequeue)(struct Qdisc *sch); struct Qdisc *qdisc; @@ -85,13 +85,13 @@ struct cbs_sched_data { }; static int cbs_child_enqueue(struct sk_buff *skb, struct Qdisc *sch, - struct Qdisc *child, + struct Qdisc *child, spinlock_t *root_lock, struct sk_buff **to_free) { unsigned int len = qdisc_pkt_len(skb); int err; - err = child->ops->enqueue(skb, child, to_free); + err = child->ops->enqueue(skb, child, root_lock, to_free); if (err != NET_XMIT_SUCCESS) return err; @@ -101,16 +101,16 @@ static int cbs_child_enqueue(struct sk_buff *skb, struct Qdisc *sch, return NET_XMIT_SUCCESS; } -static int cbs_enqueue_offload(struct sk_buff *skb, struct Qdisc *sch, +static int cbs_enqueue_offload(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_lock, struct sk_buff **to_free) { struct cbs_sched_data *q = qdisc_priv(sch); struct Qdisc *qdisc = q->qdisc; - return cbs_child_enqueue(skb, sch, qdisc, to_free); + return cbs_child_enqueue(skb, sch, qdisc, root_lock, to_free); } -static int cbs_enqueue_soft(struct sk_buff *skb, struct Qdisc *sch, +static int cbs_enqueue_soft(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_lock, struct sk_buff **to_free) { struct cbs_sched_data *q = qdisc_priv(sch); @@ -124,15 +124,15 @@ static int cbs_enqueue_soft(struct sk_buff *skb, struct 
Qdisc *sch, q->last = ktime_get_ns(); } - return cbs_child_enqueue(skb, sch, qdisc, to_free); + return cbs_child_enqueue(skb, sch, qdisc, root_lock, to_free); } -static int cbs_enqueue(struct sk_buff *skb, struct Qdisc *sch, +static int cbs_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_lock, struct sk_buff **to_free) { struct cbs_sched_data *q = qdisc_priv(sch); - return q->enqueue(skb, sch, to_free); + return q->enqueue(skb, sch, root_lock, to_free); } /* timediff is in ns, slope is in bytes/s */ diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index bd618b00d3193..baf3faee31aac 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -210,7 +210,7 @@ static bool choke_match_random(const struct choke_sched_data *q, return choke_match_flow(oskb, nskb); } -static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch, +static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_lock, struct sk_buff **to_free) { struct choke_sched_data *q = qdisc_priv(sch); diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c index 30169b3adbbb0..1d94837abdd83 100644 --- a/net/sched/sch_codel.c +++ b/net/sched/sch_codel.c @@ -108,7 +108,7 @@ static struct sk_buff *codel_qdisc_dequeue(struct Qdisc *sch) return skb; } -static int codel_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, +static int codel_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_lock, struct sk_buff **to_free) { struct codel_sched_data *q; diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 07a2b0b354954..0d5c9a8ec61da 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -337,7 +337,7 @@ static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch, return NULL; } -static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch, +static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_lock, struct sk_buff **to_free) { unsigned int len = qdisc_pkt_len(skb); @@ -355,7 
+355,7 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch, } first = !cl->qdisc->q.qlen; - err = qdisc_enqueue(skb, cl->qdisc, to_free); + err = qdisc_enqueue(skb, cl->qdisc, root_lock, to_free); if (unlikely(err != NET_XMIT_SUCCESS)) { if (net_xmit_drop_count(err)) { cl->qstats.drops++; diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 05605b30bef3a..fbe49fffcdbb3 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -198,7 +198,7 @@ static struct tcf_block *dsmark_tcf_block(struct Qdisc *sch, unsigned long cl, /* --------------------------- Qdisc operations ---------------------------- */ -static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch, +static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_lock, struct sk_buff **to_free) { unsigned int len = qdisc_pkt_len(skb); @@ -267,7 +267,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch, } } - err = qdisc_enqueue(skb, p->q, to_free); + err = qdisc_enqueue(skb, p->q, root_lock, to_free); if (err != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(err)) qdisc_qstats_drop(sch); diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c index c48f91075b5c6..7a7c50a681154 100644 --- a/net/sched/sch_etf.c +++ b/net/sched/sch_etf.c @@ -160,7 +160,7 @@ static void report_sock_error(struct sk_buff *skb, u32 err, u8 code) } static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch, - struct sk_buff **to_free) + spinlock_t *root_lock, struct sk_buff **to_free) { struct etf_sched_data *q = qdisc_priv(sch); struct rb_node **p = &q->head.rb_root.rb_node, *parent = NULL; diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c index a87e9159338c4..373dc5855d4e8 100644 --- a/net/sched/sch_ets.c +++ b/net/sched/sch_ets.c @@ -415,7 +415,7 @@ static struct ets_class *ets_classify(struct sk_buff *skb, struct Qdisc *sch, return &q->classes[band]; } -static int ets_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, +static 
int ets_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_lock, struct sk_buff **to_free) { unsigned int len = qdisc_pkt_len(skb); @@ -433,7 +433,7 @@ static int ets_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, } first = !cl->qdisc->q.qlen; - err = qdisc_enqueue(skb, cl->qdisc, to_free); + err = qdisc_enqueue(skb, cl->qdisc, root_lock, to_free); if (unlikely(err != NET_XMIT_SUCCESS)) { if (net_xmit_drop_count(err)) { cl->qstats.drops++; diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c index a579a4131d22d..b4da5b624ad86 100644 --- a/net/sched/sch_fifo.c +++ b/net/sched/sch_fifo.c @@ -16,7 +16,7 @@ /* 1 band FIFO pseudo-"scheduler" */ -static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch, +static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_lock, struct sk_buff **to_free) { if (likely(sch->qstats.backlog + qdisc_pkt_len(skb) <= sch->limit)) @@ -25,7 +25,7 @@ static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch, return qdisc_drop(skb, sch, to_free); } -static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch, +static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_lock, struct sk_buff **to_free) { if (likely(sch->q.qlen < sch->limit)) @@ -34,7 +34,7 @@ static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch, return qdisc_drop(skb, sch, to_free); } -static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch, +static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_lock, struct sk_buff **to_free) { unsigned int prev_backlog; diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 2fb76fc0cc31b..a90d745c41e05 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -439,7 +439,7 @@ static bool fq_packet_beyond_horizon(const struct sk_buff *skb, return unlikely((s64)skb->tstamp > (s64)(q->ktime_cache + q->horizon)); } -static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, +static int 
fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_lock, struct sk_buff **to_free) { struct fq_sched_data *q = qdisc_priv(sch); diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 459a784056c0f..6bf979f955091 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -181,7 +181,7 @@ static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets, return idx; } -static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch, +static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_lock, struct sk_buff **to_free) { struct fq_codel_sched_data *q = qdisc_priv(sch); diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c index fb760cee824e4..a27a250ab8f9a 100644 --- a/net/sched/sch_fq_pie.c +++ b/net/sched/sch_fq_pie.c @@ -125,7 +125,7 @@ static inline void flow_queue_add(struct fq_pie_flow *flow, skb->next = NULL; } -static int fq_pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, +static int fq_pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_lock, struct sk_buff **to_free) { struct fq_pie_sched_data *q = qdisc_priv(sch); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 265a61d011dfa..715cde1df9e46 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -520,7 +520,7 @@ EXPORT_SYMBOL(netif_carrier_off); cheaper. 
*/ -static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, +static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, spinlock_t *root_lock, struct sk_buff **to_free) { __qdisc_drop(skb, to_free); @@ -614,7 +614,7 @@ static inline struct skb_array *band2list(struct pfifo_fast_priv *priv, return &priv->q[band]; } -static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, +static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, spinlock_t *root_lock, struct sk_buff **to_free) { int band = prio2band[skb->priority & TC_PRIO_MAX]; diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index 8599c6f31b057..7d67c6cd6605b 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -161,7 +161,7 @@ static bool gred_per_vq_red_flags_used(struct gred_sched *table) return false; } -static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch, +static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_lock, struct sk_buff **to_free) { struct gred_sched_data *q = NULL; diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 433f2190960fe..7f6670044f0a9 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1528,8 +1528,8 @@ hfsc_dump_qdisc(struct Qdisc *sch, struct sk_buff *skb) return -1; } -static int -hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) +static int hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_lock, + struct sk_buff **to_free) { unsigned int len = qdisc_pkt_len(skb); struct hfsc_class *cl; @@ -1545,7 +1545,7 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) } first = !cl->qdisc->q.qlen; - err = qdisc_enqueue(skb, cl->qdisc, to_free); + err = qdisc_enqueue(skb, cl->qdisc, root_lock, to_free); if (unlikely(err != NET_XMIT_SUCCESS)) { if (net_xmit_drop_count(err)) { cl->qstats.drops++; diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index 420ede8753229..ddc6bf1d85d0e 100644 --- 
a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -368,7 +368,7 @@ static unsigned int hhf_drop(struct Qdisc *sch, struct sk_buff **to_free) return bucket - q->buckets; } -static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch, +static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_lock, struct sk_buff **to_free) { struct hhf_sched_data *q = qdisc_priv(sch); diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 8184c87da8bec..52fc513688b1b 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -576,7 +576,7 @@ static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl) cl->prio_activity = 0; } -static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch, +static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_lock, struct sk_buff **to_free) { int uninitialized_var(ret); @@ -599,7 +599,7 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch, __qdisc_drop(skb, to_free); return ret; #endif - } else if ((ret = qdisc_enqueue(skb, cl->leaf.q, + } else if ((ret = qdisc_enqueue(skb, cl->leaf.q, root_lock, to_free)) != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(ret)) { qdisc_qstats_drop(sch); diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index 1330ad2249317..648611f5c1052 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -57,7 +57,7 @@ multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) } static int -multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch, +multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_lock, struct sk_buff **to_free) { struct Qdisc *qdisc; @@ -74,7 +74,7 @@ multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch, } #endif - ret = qdisc_enqueue(skb, qdisc, to_free); + ret = qdisc_enqueue(skb, qdisc, root_lock, to_free); if (ret == NET_XMIT_SUCCESS) { sch->q.qlen++; return NET_XMIT_SUCCESS; diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 84f82771cdf5d..8fb17483a34f3 
100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -431,7 +431,7 @@ static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch, * NET_XMIT_DROP: queue length didn't change. * NET_XMIT_SUCCESS: one skb was queued. */ -static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, +static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_lock, struct sk_buff **to_free) { struct netem_sched_data *q = qdisc_priv(sch); @@ -480,7 +480,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, u32 dupsave = q->duplicate; /* prevent duplicating a dup... */ q->duplicate = 0; - rootq->enqueue(skb2, rootq, to_free); + rootq->enqueue(skb2, rootq, root_lock, to_free); q->duplicate = dupsave; rc_drop = NET_XMIT_SUCCESS; } @@ -604,7 +604,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, skb_mark_not_on_list(segs); qdisc_skb_cb(segs)->pkt_len = segs->len; last_len = segs->len; - rc = qdisc_enqueue(segs, sch, to_free); + rc = qdisc_enqueue(segs, sch, root_lock, to_free); if (rc != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(rc)) qdisc_qstats_drop(sch); @@ -720,7 +720,7 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch) struct sk_buff *to_free = NULL; int err; - err = qdisc_enqueue(skb, q->qdisc, &to_free); + err = qdisc_enqueue(skb, q->qdisc, NULL, &to_free); kfree_skb_list(to_free); if (err != NET_XMIT_SUCCESS && net_xmit_drop_count(err)) { diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index c65077f0c0f39..b305313b64e30 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -82,7 +82,7 @@ bool pie_drop_early(struct Qdisc *sch, struct pie_params *params, } EXPORT_SYMBOL_GPL(pie_drop_early); -static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, +static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_lock, struct sk_buff **to_free) { struct pie_sched_data *q = qdisc_priv(sch); diff --git a/net/sched/sch_plug.c 
b/net/sched/sch_plug.c index cbc2ebca4548c..e5f8b4769b4d6 100644 --- a/net/sched/sch_plug.c +++ b/net/sched/sch_plug.c @@ -84,7 +84,7 @@ struct plug_sched_data { u32 pkts_to_release; }; -static int plug_enqueue(struct sk_buff *skb, struct Qdisc *sch, +static int plug_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_lock, struct sk_buff **to_free) { struct plug_sched_data *q = qdisc_priv(sch); diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index 647941702f9fc..a3e187f2603c0 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -65,8 +65,8 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) return q->queues[band]; } -static int -prio_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) +static int prio_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_lock, + struct sk_buff **to_free) { unsigned int len = qdisc_pkt_len(skb); struct Qdisc *qdisc; @@ -83,7 +83,7 @@ prio_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) } #endif - ret = qdisc_enqueue(skb, qdisc, to_free); + ret = qdisc_enqueue(skb, qdisc, root_lock, to_free); if (ret == NET_XMIT_SUCCESS) { sch->qstats.backlog += len; sch->q.qlen++; diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 0b05ac7c848eb..ede854516825c 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -1194,7 +1194,7 @@ static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *q) return agg; } -static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, +static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_lock, struct sk_buff **to_free) { unsigned int len = qdisc_pkt_len(skb), gso_segs; @@ -1225,7 +1225,7 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, gso_segs = skb_is_gso(skb) ? 
skb_shinfo(skb)->gso_segs : 1; first = !cl->qdisc->q.qlen; - err = qdisc_enqueue(skb, cl->qdisc, to_free); + err = qdisc_enqueue(skb, cl->qdisc, root_lock, to_free); if (unlikely(err != NET_XMIT_SUCCESS)) { pr_debug("qfq_enqueue: enqueue failed %d\n", err); if (net_xmit_drop_count(err)) { diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 555a1b9e467fc..6ace7d757e8b4 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -65,7 +65,7 @@ static int red_use_nodrop(struct red_sched_data *q) return q->flags & TC_RED_NODROP; } -static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch, +static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_lock, struct sk_buff **to_free) { struct red_sched_data *q = qdisc_priv(sch); @@ -118,7 +118,7 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch, break; } - ret = qdisc_enqueue(skb, child, to_free); + ret = qdisc_enqueue(skb, child, root_lock, to_free); if (likely(ret == NET_XMIT_SUCCESS)) { qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index 4074c50ac3d73..d2a6e78262bbc 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -276,7 +276,7 @@ static bool sfb_classify(struct sk_buff *skb, struct tcf_proto *fl, return false; } -static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch, +static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_lock, struct sk_buff **to_free) { @@ -399,7 +399,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch, } enqueue: - ret = qdisc_enqueue(skb, child, to_free); + ret = qdisc_enqueue(skb, child, root_lock, to_free); if (likely(ret == NET_XMIT_SUCCESS)) { qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 5a6def5e4e6df..46cdefd69e44d 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -343,7 +343,7 @@ static int sfq_headdrop(const struct sfq_sched_data *q) } static 
int -sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) +sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_lock, struct sk_buff **to_free) { struct sfq_sched_data *q = qdisc_priv(sch); unsigned int hash, dropped; diff --git a/net/sched/sch_skbprio.c b/net/sched/sch_skbprio.c index 7a5e4c4547156..f75f237c44369 100644 --- a/net/sched/sch_skbprio.c +++ b/net/sched/sch_skbprio.c @@ -65,7 +65,7 @@ static u16 calc_new_low_prio(const struct skbprio_sched_data *q) return SKBPRIO_MAX_PRIORITY - 1; } -static int skbprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, +static int skbprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_lock, struct sk_buff **to_free) { const unsigned int max_priority = SKBPRIO_MAX_PRIORITY - 1; diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index e981992634ddf..daef2ff60a985 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -410,7 +410,7 @@ static long get_packet_txtime(struct sk_buff *skb, struct Qdisc *sch) return txtime; } -static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, +static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_lock, struct sk_buff **to_free) { struct taprio_sched *q = qdisc_priv(sch); @@ -435,7 +435,7 @@ static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; - return qdisc_enqueue(skb, child, to_free); + return qdisc_enqueue(skb, child, root_lock, to_free); } static struct sk_buff *taprio_peek_soft(struct Qdisc *sch) diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 78e79029dc631..c3eb5cdb83a82 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -187,7 +187,7 @@ static int tbf_offload_dump(struct Qdisc *sch) /* GSO packet is too big, segment it so that tbf can transmit * each segment in time */ -static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch, +static int tbf_segment(struct sk_buff *skb, 
struct Qdisc *sch, spinlock_t *root_lock, struct sk_buff **to_free) { struct tbf_sched_data *q = qdisc_priv(sch); @@ -206,7 +206,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch, skb_mark_not_on_list(segs); qdisc_skb_cb(segs)->pkt_len = segs->len; len += segs->len; - ret = qdisc_enqueue(segs, q->qdisc, to_free); + ret = qdisc_enqueue(segs, q->qdisc, root_lock, to_free); if (ret != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(ret)) qdisc_qstats_drop(sch); @@ -221,7 +221,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch, return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP; } -static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch, +static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_lock, struct sk_buff **to_free) { struct tbf_sched_data *q = qdisc_priv(sch); @@ -231,10 +231,10 @@ static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (qdisc_pkt_len(skb) > q->max_size) { if (skb_is_gso(skb) && skb_gso_validate_mac_len(skb, q->max_size)) - return tbf_segment(skb, sch, to_free); + return tbf_segment(skb, sch, root_lock, to_free); return qdisc_drop(skb, sch, to_free); } - ret = qdisc_enqueue(skb, q->qdisc, to_free); + ret = qdisc_enqueue(skb, q->qdisc, root_lock, to_free); if (ret != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(ret)) qdisc_qstats_drop(sch); diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index 689ef6f3ded80..5119646534761 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -72,8 +72,8 @@ struct teql_sched_data { /* "teql*" qdisc routines */ -static int -teql_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) +static int teql_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_lock, + struct sk_buff **to_free) { struct net_device *dev = qdisc_dev(sch); struct teql_sched_data *q = qdisc_priv(sch); From 3625750f05ecce21a0fce429c1ff85acfffb461b Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Sat, 27 Jun 2020 01:45:26 +0300 
Subject: [PATCH 2/5] net: sched: Introduce helpers for qevent blocks

Qevents are attach points for TC blocks, where filters can be put that are
executed when "interesting events" take place in a qdisc. The data to keep
and the functions to invoke to maintain a qevent will be largely the same
between qevents. Therefore introduce sched-wide helpers for qevent
management.

Currently, similarly to the ingress and egress blocks of the clsact
pseudo-qdisc, block attachment cannot be changed after the qdisc is created.
To that end, add a helper tcf_qevent_validate_change(), which verifies that
the block index attribute is either absent or, if present, that its value
matches the current one (i.e. there is no material change).

The function tcf_qevent_handle() should be invoked when a qdisc hits the
"interesting event" corresponding to a block. This function releases the
root lock for the duration of executing the attached filters, to allow
packets generated through user actions (notably mirred) to be reinserted
into the same qdisc tree.

Signed-off-by: Petr Machata
Signed-off-by: David S.
Miller
---
 include/net/pkt_cls.h |  49 ++++++++++++++++
 net/sched/cls_api.c   | 137 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 186 insertions(+)

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index ff017e5b3ea27..690a7f49c8f90 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -32,6 +32,12 @@ struct tcf_block_ext_info {
 	u32 block_index;
 };
 
+struct tcf_qevent {
+	struct tcf_block	*block;
+	struct tcf_block_ext_info info;
+	struct tcf_proto __rcu *filter_chain;
+};
+
 struct tcf_block_cb;
 bool tcf_queue_work(struct rcu_work *rwork, work_func_t func);
 
@@ -553,6 +559,49 @@ int tc_setup_cb_reoffload(struct tcf_block *block, struct tcf_proto *tp,
 			  void *cb_priv, u32 *flags, unsigned int *in_hw_count);
 unsigned int tcf_exts_num_actions(struct tcf_exts *exts);
 
+#ifdef CONFIG_NET_CLS_ACT
+int tcf_qevent_init(struct tcf_qevent *qe, struct Qdisc *sch,
+		    enum flow_block_binder_type binder_type,
+		    struct nlattr *block_index_attr,
+		    struct netlink_ext_ack *extack);
+void tcf_qevent_destroy(struct tcf_qevent *qe, struct Qdisc *sch);
+int tcf_qevent_validate_change(struct tcf_qevent *qe, struct nlattr *block_index_attr,
+			       struct netlink_ext_ack *extack);
+struct sk_buff *tcf_qevent_handle(struct tcf_qevent *qe, struct Qdisc *sch, struct sk_buff *skb,
+				  spinlock_t *root_lock, struct sk_buff **to_free, int *ret);
+int tcf_qevent_dump(struct sk_buff *skb, int attr_name, struct tcf_qevent *qe);
+#else
+static inline int tcf_qevent_init(struct tcf_qevent *qe, struct Qdisc *sch,
+				  enum flow_block_binder_type binder_type,
+				  struct nlattr *block_index_attr,
+				  struct netlink_ext_ack *extack)
+{
+	return 0;
+}
+
+static inline void tcf_qevent_destroy(struct tcf_qevent *qe, struct Qdisc *sch)
+{
+}
+
+static inline int tcf_qevent_validate_change(struct tcf_qevent *qe, struct nlattr *block_index_attr,
+					     struct netlink_ext_ack *extack)
+{
+	return 0;
+}
+
+static inline struct sk_buff *
+tcf_qevent_handle(struct tcf_qevent *qe, struct Qdisc *sch, struct sk_buff *skb,
+		  spinlock_t *root_lock, struct sk_buff **to_free, int *ret)
+{
+	return skb;
+}
+
+static inline int tcf_qevent_dump(struct sk_buff *skb, int attr_name, struct tcf_qevent *qe)
+{
+	return 0;
+}
+#endif
+
 struct tc_cls_u32_knode {
 	struct tcf_exts *exts;
 	struct tcf_result *res;
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 5bfa6b985bb89..1b14d5f57e7f6 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -3748,6 +3748,143 @@ unsigned int tcf_exts_num_actions(struct tcf_exts *exts)
 }
 EXPORT_SYMBOL(tcf_exts_num_actions);
 
+#ifdef CONFIG_NET_CLS_ACT
+static int tcf_qevent_parse_block_index(struct nlattr *block_index_attr,
+					u32 *p_block_index,
+					struct netlink_ext_ack *extack)
+{
+	*p_block_index = nla_get_u32(block_index_attr);
+	if (!*p_block_index) {
+		NL_SET_ERR_MSG(extack, "Block number may not be zero");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+int tcf_qevent_init(struct tcf_qevent *qe, struct Qdisc *sch,
+		    enum flow_block_binder_type binder_type,
+		    struct nlattr *block_index_attr,
+		    struct netlink_ext_ack *extack)
+{
+	u32 block_index;
+	int err;
+
+	if (!block_index_attr)
+		return 0;
+
+	err = tcf_qevent_parse_block_index(block_index_attr, &block_index, extack);
+	if (err)
+		return err;
+
+	qe->info.binder_type = binder_type;
+	qe->info.chain_head_change = tcf_chain_head_change_dflt;
+	qe->info.chain_head_change_priv = &qe->filter_chain;
+	qe->info.block_index = block_index;
+
+	return tcf_block_get_ext(&qe->block, sch, &qe->info, extack);
+}
+EXPORT_SYMBOL(tcf_qevent_init);
+
+void tcf_qevent_destroy(struct tcf_qevent *qe, struct Qdisc *sch)
+{
+	if (qe->info.block_index)
+		tcf_block_put_ext(qe->block, sch, &qe->info);
+}
+EXPORT_SYMBOL(tcf_qevent_destroy);
+
+int tcf_qevent_validate_change(struct tcf_qevent *qe, struct nlattr *block_index_attr,
+			       struct netlink_ext_ack *extack)
+{
+	u32 block_index;
+	int err;
+
+	if (!block_index_attr)
+		return 0;
+
+	err = tcf_qevent_parse_block_index(block_index_attr, &block_index, extack);
+	if (err)
+		return err;
+
+	/* Bounce newly-configured block or change in block. */
+	if (block_index != qe->info.block_index) {
+		NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(tcf_qevent_validate_change);
+
+/**
+ * tcf_qevent_handle - run the filters attached to a qevent on a packet
+ * @qe: qevent to run; nothing is done when no block is bound to it
+ * @sch: qdisc that hit the event; its drop counter is bumped on TC_ACT_SHOT
+ * @skb: packet that triggered the event
+ * @root_lock: qdisc root lock held by the caller, or NULL
+ * @to_free: list handed to __qdisc_drop() for packets dropped here
+ * @ret: set to a __NET_XMIT_* code whenever the packet is consumed
+ *
+ * The root lock is released while the filters run, so that packets generated
+ * by actions (notably mirred) can be enqueued to the same qdisc tree. It is
+ * taken again before returning on every path, including the ones where the
+ * packet was consumed, because the caller expects to still hold it.
+ *
+ * Return: @skb if the packet should carry on through the qdisc, or NULL if
+ * the filters consumed it (dropped, stolen or redirected).
+ */
+struct sk_buff *tcf_qevent_handle(struct tcf_qevent *qe, struct Qdisc *sch, struct sk_buff *skb,
+				  spinlock_t *root_lock, struct sk_buff **to_free, int *ret)
+{
+	struct tcf_result cl_res;
+	struct tcf_proto *fl;
+
+	if (!qe->info.block_index)
+		return skb;
+
+	fl = rcu_dereference_bh(qe->filter_chain);
+
+	if (root_lock)
+		spin_unlock(root_lock);
+
+	switch (tcf_classify(skb, fl, &cl_res, false)) {
+	case TC_ACT_SHOT:
+		qdisc_qstats_drop(sch);
+		__qdisc_drop(skb, to_free);
+		*ret = __NET_XMIT_BYPASS;
+		skb = NULL;
+		break;
+	case TC_ACT_STOLEN:
+	case TC_ACT_QUEUED:
+	case TC_ACT_TRAP:
+		__qdisc_drop(skb, to_free);
+		*ret = __NET_XMIT_STOLEN;
+		skb = NULL;
+		break;
+	case TC_ACT_REDIRECT:
+		skb_do_redirect(skb);
+		*ret = __NET_XMIT_STOLEN;
+		skb = NULL;
+		break;
+	}
+
+	/* Single exit: the lock is re-taken on the consumed-packet paths too. */
+	if (root_lock)
+		spin_lock(root_lock);
+
+	return skb;
+}
+EXPORT_SYMBOL(tcf_qevent_handle);
+
+int tcf_qevent_dump(struct sk_buff *skb, int attr_name, struct tcf_qevent *qe)
+{
+	if (!qe->info.block_index)
+		return 0;
+	return nla_put_u32(skb, attr_name, qe->info.block_index);
+}
+EXPORT_SYMBOL(tcf_qevent_dump);
+#endif
+
 static __net_init int tcf_net_init(struct net *net)
 {
 	struct tcf_net *tn = net_generic(net, tcf_net_id);
From 65545ea24998bb9aab1ce713a67c693dc7a947ec Mon Sep 17 00:00:00 2001
From: Petr Machata
Date: Sat, 27 Jun 2020 01:45:27 +0300
Subject: [PATCH 3/5] net: sched: sch_red: Split init and change callbacks

In the following patches, RED will get two qevents. The implementation will
be clearer if the callback for change is not a pure subset of the callback
for init.
Split the two and promote attribute parsing to the callbacks themselves from the common code, because it will be handy there. Signed-off-by: Petr Machata Signed-off-by: David S. Miller --- net/sched/sch_red.c | 42 ++++++++++++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 6ace7d757e8b4..225ce370e5a8b 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -215,12 +215,11 @@ static const struct nla_policy red_policy[TCA_RED_MAX + 1] = { [TCA_RED_FLAGS] = NLA_POLICY_BITFIELD32(TC_RED_SUPPORTED_FLAGS), }; -static int red_change(struct Qdisc *sch, struct nlattr *opt, - struct netlink_ext_ack *extack) +static int __red_change(struct Qdisc *sch, struct nlattr **tb, + struct netlink_ext_ack *extack) { struct Qdisc *old_child = NULL, *child = NULL; struct red_sched_data *q = qdisc_priv(sch); - struct nlattr *tb[TCA_RED_MAX + 1]; struct nla_bitfield32 flags_bf; struct tc_red_qopt *ctl; unsigned char userbits; @@ -228,14 +227,6 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt, int err; u32 max_P; - if (opt == NULL) - return -EINVAL; - - err = nla_parse_nested_deprecated(tb, TCA_RED_MAX, opt, red_policy, - NULL); - if (err < 0) - return err; - if (tb[TCA_RED_PARMS] == NULL || tb[TCA_RED_STAB] == NULL) return -EINVAL; @@ -323,11 +314,38 @@ static int red_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct red_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_RED_MAX + 1]; + int err; + + if (!opt) + return -EINVAL; + + err = nla_parse_nested_deprecated(tb, TCA_RED_MAX, opt, red_policy, + extack); + if (err < 0) + return err; q->qdisc = &noop_qdisc; q->sch = sch; timer_setup(&q->adapt_timer, red_adaptative_timer, 0); - return red_change(sch, opt, extack); + return __red_change(sch, tb, extack); +} + +static int red_change(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct nlattr 
*tb[TCA_RED_MAX + 1]; + int err; + + if (!opt) + return -EINVAL; + + err = nla_parse_nested_deprecated(tb, TCA_RED_MAX, opt, red_policy, + extack); + if (err < 0) + return err; + + return __red_change(sch, tb, extack); } static int red_dump_offload_stats(struct Qdisc *sch) From aee9caa03fc3c8b02f8f31824354d85f30e562e0 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Sat, 27 Jun 2020 01:45:28 +0300 Subject: [PATCH 4/5] net: sched: sch_red: Add qevents "early_drop" and "mark" In order to allow acting on dropped and/or ECN-marked packets, add two new qevents to the RED qdisc: "early_drop" and "mark". Filters attached at "early_drop" block are executed as packets are early-dropped, those attached at the "mark" block are executed as packets are ECN-marked. Two new attributes are introduced: TCA_RED_EARLY_DROP_BLOCK with the block index for the "early_drop" qevent, and TCA_RED_MARK_BLOCK for the "mark" qevent. Absence of these attributes signifies "don't care": no block is allocated in that case, or the existing blocks are left intact in case of the change callback. For purposes of offloading, blocks attached to these qevents appear with newly-introduced binder types, FLOW_BLOCK_BINDER_TYPE_RED_EARLY_DROP and FLOW_BLOCK_BINDER_TYPE_RED_MARK. Signed-off-by: Petr Machata Signed-off-by: David S. 
Miller
---
 include/net/flow_offload.h     |  2 ++
 include/uapi/linux/pkt_sched.h |  2 ++
 net/sched/sch_red.c            | 51 +++++++++++++++++++++++++++++++--
 3 files changed, 53 insertions(+), 2 deletions(-)

diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h
index 3bafb5124ac08..3e793ac66baf7 100644
--- a/include/net/flow_offload.h
+++ b/include/net/flow_offload.h
@@ -424,6 +424,8 @@ enum flow_block_binder_type {
 	FLOW_BLOCK_BINDER_TYPE_UNSPEC,
 	FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS,
 	FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS,
+	FLOW_BLOCK_BINDER_TYPE_RED_EARLY_DROP,
+	FLOW_BLOCK_BINDER_TYPE_RED_MARK,
 };
 
 struct flow_block {
diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index a95f3ae7ab37c..9e7c2c6078456 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -257,6 +257,8 @@ enum {
 	TCA_RED_STAB,
 	TCA_RED_MAX_P,
 	TCA_RED_FLAGS,		/* bitfield32 */
+	TCA_RED_EARLY_DROP_BLOCK, /* u32 */
+	TCA_RED_MARK_BLOCK,	/* u32 */
 	__TCA_RED_MAX,
 };
 
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 225ce370e5a8b..de2be4d04ed6b 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -46,6 +46,8 @@ struct red_sched_data {
 	struct red_vars		vars;
 	struct red_stats	stats;
 	struct Qdisc		*qdisc;
+	struct tcf_qevent	qe_early_drop;
+	struct tcf_qevent	qe_mark;
 };
 
 #define TC_RED_SUPPORTED_FLAGS (TC_RED_HISTORIC_FLAGS | TC_RED_NODROP)
@@ -92,6 +94,9 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_
 
 		if (INET_ECN_set_ce(skb)) {
 			q->stats.prob_mark++;
+			skb = tcf_qevent_handle(&q->qe_mark, sch, skb, root_lock, to_free, &ret);
+			if (!skb)
+				return NET_XMIT_CN | ret;
 		} else if (!red_use_nodrop(q)) {
 			q->stats.prob_drop++;
 			goto congestion_drop;
@@ -109,6 +114,9 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_
 
 		if (INET_ECN_set_ce(skb)) {
 			q->stats.forced_mark++;
+			skb = tcf_qevent_handle(&q->qe_mark, sch, skb, root_lock, to_free, &ret);
+			if (!skb)
+				return NET_XMIT_CN | ret;
 		} else if (!red_use_nodrop(q)) {
 			q->stats.forced_drop++;
 			goto congestion_drop;
@@ -129,6 +137,10 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_
 	return ret;
 
 congestion_drop:
+	skb = tcf_qevent_handle(&q->qe_early_drop, sch, skb, root_lock, to_free, &ret);
+	if (!skb)
+		return NET_XMIT_CN | ret;
+
 	qdisc_drop(skb, sch, to_free);
 	return NET_XMIT_CN;
 }
@@ -202,6 +214,8 @@ static void red_destroy(struct Qdisc *sch)
 {
 	struct red_sched_data *q = qdisc_priv(sch);
 
+	tcf_qevent_destroy(&q->qe_mark, sch);
+	tcf_qevent_destroy(&q->qe_early_drop, sch);
 	del_timer_sync(&q->adapt_timer);
 	red_offload(sch, false);
 	qdisc_put(q->qdisc);
@@ -213,6 +227,8 @@ static const struct nla_policy red_policy[TCA_RED_MAX + 1] = {
 	[TCA_RED_STAB]	= { .len = RED_STAB_SIZE },
 	[TCA_RED_MAX_P] = { .type = NLA_U32 },
 	[TCA_RED_FLAGS] = NLA_POLICY_BITFIELD32(TC_RED_SUPPORTED_FLAGS),
+	[TCA_RED_EARLY_DROP_BLOCK] = { .type = NLA_U32 },
+	[TCA_RED_MARK_BLOCK] = { .type = NLA_U32 },
 };
 
 static int __red_change(struct Qdisc *sch, struct nlattr **tb,
@@ -328,12 +344,31 @@ static int red_init(struct Qdisc *sch, struct nlattr *opt,
 	q->qdisc = &noop_qdisc;
 	q->sch = sch;
 	timer_setup(&q->adapt_timer, red_adaptative_timer, 0);
-	return __red_change(sch, tb, extack);
+
+	err = __red_change(sch, tb, extack);
+	if (err)
+		return err;
+
+	/* No unwinding on failure below: when ->init() fails, the qdisc core
+	 * calls ->destroy(), so red_destroy() already stops the timer, removes
+	 * the offload, puts the child qdisc and destroys the qevents. Doing
+	 * the same here as well would put q->qdisc twice.
+	 */
+	err = tcf_qevent_init(&q->qe_early_drop, sch,
+			      FLOW_BLOCK_BINDER_TYPE_RED_EARLY_DROP,
+			      tb[TCA_RED_EARLY_DROP_BLOCK], extack);
+	if (err)
+		return err;
+
+	return tcf_qevent_init(&q->qe_mark, sch,
+			       FLOW_BLOCK_BINDER_TYPE_RED_MARK,
+			       tb[TCA_RED_MARK_BLOCK], extack);
 }
 
 static int red_change(struct Qdisc *sch, struct nlattr *opt,
 		      struct netlink_ext_ack *extack)
 {
+	struct red_sched_data *q = qdisc_priv(sch);
 	struct nlattr *tb[TCA_RED_MAX + 1];
 	int err;
 
@@ -345,6 +380,16 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt,
 	if (err < 0)
 		return err;
 
+	err = tcf_qevent_validate_change(&q->qe_early_drop,
+					 tb[TCA_RED_EARLY_DROP_BLOCK], extack);
+	if (err)
+		return err;
+
+	err = tcf_qevent_validate_change(&q->qe_mark,
+					 tb[TCA_RED_MARK_BLOCK], extack);
+	if (err)
+		return err;
+
 	return __red_change(sch, tb, extack);
 }
 
@@ -389,7 +434,9 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
 	if (nla_put(skb, TCA_RED_PARMS, sizeof(opt), &opt) ||
 	    nla_put_u32(skb, TCA_RED_MAX_P, q->parms.max_P) ||
 	    nla_put_bitfield32(skb, TCA_RED_FLAGS,
-			       q->flags, TC_RED_SUPPORTED_FLAGS))
+			       q->flags, TC_RED_SUPPORTED_FLAGS) ||
+	    tcf_qevent_dump(skb, TCA_RED_MARK_BLOCK, &q->qe_mark) ||
+	    tcf_qevent_dump(skb, TCA_RED_EARLY_DROP_BLOCK, &q->qe_early_drop))
 		goto nla_put_failure;
 	return nla_nest_end(skb, opts);
 
From 6cf0291f95172db68d8a283854389a1966e43c65 Mon Sep 17 00:00:00 2001
From: Petr Machata
Date: Sat, 27 Jun 2020 01:45:29 +0300
Subject: [PATCH 5/5] selftests: forwarding: Add a RED test for SW datapath

This test is inspired by the mlxsw RED selftest. It is much simpler to set
up (also because there is no point in testing PRIO / RED encapsulation). It
tests bare RED, ECN and ECN+nodrop modes of operation. On top of that it
tests RED early_drop and mark qevents.

Signed-off-by: Petr Machata
Signed-off-by: David S.
Miller
---
 .../selftests/net/forwarding/sch_red.sh       | 494 ++++++++++++++++++
 1 file changed, 494 insertions(+)
 create mode 100755 tools/testing/selftests/net/forwarding/sch_red.sh

diff --git a/tools/testing/selftests/net/forwarding/sch_red.sh b/tools/testing/selftests/net/forwarding/sch_red.sh
new file mode 100755
index 0000000000000..e714bae473fb4
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/sch_red.sh
@@ -0,0 +1,494 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This test sends one stream of traffic from H1 through a TBF shaper, to a RED
+# within TBF shaper on $swp3. The two shapers have the same configuration, and
+# thus the resulting stream should fill all available bandwidth on the latter
+# shaper. A second stream is sent from H2 also via $swp3, and used to inject
+# additional traffic. Since all available bandwidth is taken, this traffic has
+# to go to backlog.
+#
+# +--------------------------+                     +--------------------------+
+# | H1                       |                     | H2                       |
+# |     + $h1                |                     |     + $h2                |
+# |     | 192.0.2.1/28       |                     |     | 192.0.2.2/28       |
+# |     | TBF 10Mbps         |                     |     |                    |
+# +-----|--------------------+                     +-----|--------------------+
+#       |                                                |
+# +-----|------------------------------------------------|--------------------+
+# | SW  |                                                |                    |
+# |  +--|------------------------------------------------|----------------+   |
+# |  |  + $swp1                                          + $swp2          |   |
+# |  |                                BR                                  |   |
+# |  |                                                                    |   |
+# |  |                                + $swp3                             |   |
+# |  |                                | TBF 10Mbps / RED                  |   |
+# |  +--------------------------------|-----------------------------------+   |
+# |                                   |                                       |
+# +-----------------------------------|---------------------------------------+
+#                                     |
+#                               +-----|--------------------+
+#                               | H3  |                    |
+#                               |     + $h3                |
+#                               |       192.0.2.3/28       |
+#                               |                          |
+#                               +--------------------------+
+
+ALL_TESTS="
+	ping_ipv4
+	ecn_test
+	ecn_nodrop_test
+	red_test
+	red_qevent_test
+	ecn_qevent_test
+"
+
+NUM_NETIFS=6
+CHECK_TC="yes"
+source lib.sh
+
+BACKLOG=30000
+PKTSZ=1400
+
+h1_create()
+{
+	simple_if_init $h1 192.0.2.1/28
+	mtu_set $h1 10000
+	tc qdisc replace dev $h1 root handle 1: tbf \
+	   rate 10Mbit burst 10K limit 1M
+}
+
+h1_destroy()
+{
+	tc qdisc del dev $h1 root
+	mtu_restore $h1
+	simple_if_fini $h1 192.0.2.1/28
+}
+
+h2_create()
+{
+	simple_if_init $h2 192.0.2.2/28
+	mtu_set $h2 10000
+}
+
+h2_destroy()
+{
+	mtu_restore $h2
+	simple_if_fini $h2 192.0.2.2/28
+}
+
+h3_create()
+{
+	simple_if_init $h3 192.0.2.3/28
+	mtu_set $h3 10000
+}
+
+h3_destroy()
+{
+	mtu_restore $h3
+	simple_if_fini $h3 192.0.2.3/28
+}
+
+switch_create()
+{
+	ip link add dev br up type bridge
+	ip link set dev $swp1 up master br
+	ip link set dev $swp2 up master br
+	ip link set dev $swp3 up master br
+
+	mtu_set $swp1 10000
+	mtu_set $swp2 10000
+	mtu_set $swp3 10000
+
+	tc qdisc replace dev $swp3 root handle 1: tbf \
+	   rate 10Mbit burst 10K limit 1M
+	ip link add name _drop_test up type dummy
+}
+
+switch_destroy()
+{
+	ip link del dev _drop_test
+	tc qdisc del dev $swp3 root
+
+	mtu_restore $swp3
+	mtu_restore $swp2
+	mtu_restore $swp1
+
+	ip link set dev $swp3 down nomaster
+	ip link set dev $swp2 down nomaster
+	ip link set dev $swp1 down nomaster
+	ip link del dev br
+}
+
+setup_prepare()
+{
+	h1=${NETIFS[p1]}
+	swp1=${NETIFS[p2]}
+
+	h2=${NETIFS[p3]}
+	swp2=${NETIFS[p4]}
+
+	swp3=${NETIFS[p5]}
+	h3=${NETIFS[p6]}
+
+	h3_mac=$(mac_get $h3)
+
+	vrf_prepare
+
+	h1_create
+	h2_create
+	h3_create
+	switch_create
+}
+
+cleanup()
+{
+	pre_cleanup
+
+	switch_destroy
+	h3_destroy
+	h2_destroy
+	h1_destroy
+
+	vrf_cleanup
+}
+
+ping_ipv4()
+{
+	ping_test $h1 192.0.2.3 " from host 1"
+	ping_test $h2 192.0.2.3 " from host 2"
+}
+
+get_qdisc_backlog()
+{
+	qdisc_stats_get $swp3 11: .backlog
+}
+
+get_nmarked()
+{
+	qdisc_stats_get $swp3 11: .marked
+}
+
+get_qdisc_npackets()
+{
+	qdisc_stats_get $swp3 11: .packets
+}
+
+get_nmirrored()
+{
+	link_stats_get _drop_test tx packets
+}
+
+send_packets()
+{
+	local proto=$1; shift
+	local pkts=$1; shift
+
+	$MZ $h2 -p $PKTSZ -a own -b $h3_mac -A 192.0.2.2 -B 192.0.2.3 -t $proto -q -c $pkts "$@"
+}
+
+# This sends traffic in an attempt to build a backlog of $size. Returns 0 on
+# success. After 10 failed attempts it bails out and returns 1. It dumps the
+# backlog size to stdout.
+build_backlog()
+{
+	local size=$1; shift
+	local proto=$1; shift
+
+	local i=0
+
+	while :; do
+		local cur=$(get_qdisc_backlog)
+		local diff=$((size - cur))
+		local pkts=$(((diff + PKTSZ - 1) / PKTSZ))
+
+		if ((cur >= size)); then
+			echo $cur
+			return 0
+		elif ((i++ > 10)); then
+			echo $cur
+			return 1
+		fi
+
+		send_packets $proto $pkts "$@"
+		sleep 1
+	done
+}
+
+check_marking()
+{
+	local cond=$1; shift
+
+	local npackets_0=$(get_qdisc_npackets)
+	local nmarked_0=$(get_nmarked)
+	sleep 5
+	local npackets_1=$(get_qdisc_npackets)
+	local nmarked_1=$(get_nmarked)
+
+	local nmarked_d=$((nmarked_1 - nmarked_0))
+	local npackets_d=$((npackets_1 - npackets_0))
+	local pct=$((100 * nmarked_d / npackets_d))
+
+	echo $pct
+	((pct $cond))
+}
+
+check_mirroring()
+{
+	local cond=$1; shift
+
+	local npackets_0=$(get_qdisc_npackets)
+	local nmirrored_0=$(get_nmirrored)
+	sleep 5
+	local npackets_1=$(get_qdisc_npackets)
+	local nmirrored_1=$(get_nmirrored)
+
+	local nmirrored_d=$((nmirrored_1 - nmirrored_0))
+	local npackets_d=$((npackets_1 - npackets_0))
+	local pct=$((100 * nmirrored_d / npackets_d))
+
+	echo $pct
+	((pct $cond))
+}
+
+ecn_test_common()
+{
+	local name=$1; shift
+	local limit=$1; shift
+	local backlog
+	local pct
+
+	# Build the below-the-limit backlog using UDP. We could use TCP just
+	# fine, but this way we get a proof that UDP is accepted when queue
+	# length is below the limit. The main stream is using TCP, and if the
+	# limit is misconfigured, we would see this traffic being ECN marked.
+	RET=0
+	backlog=$(build_backlog $((2 * limit / 3)) udp)
+	check_err $? "Could not build the requested backlog"
+	pct=$(check_marking "== 0")
+	check_err $? "backlog $backlog / $limit Got $pct% marked packets, expected == 0."
+	log_test "$name backlog < limit"
+
+	# Now push TCP, because non-TCP traffic would be early-dropped after the
+	# backlog crosses the limit, and we want to make sure that the backlog
+	# is above the limit.
+	RET=0
+	backlog=$(build_backlog $((3 * limit / 2)) tcp tos=0x01)
+	check_err $? "Could not build the requested backlog"
+	pct=$(check_marking ">= 95")
+	check_err $? "backlog $backlog / $limit Got $pct% marked packets, expected >= 95."
+	log_test "$name backlog > limit"
+}
+
+do_ecn_test()
+{
+	local limit=$1; shift
+	local name=ECN
+
+	$MZ $h1 -p $PKTSZ -A 192.0.2.1 -B 192.0.2.3 -c 0 \
+		-a own -b $h3_mac -t tcp -q tos=0x01 &
+	sleep 1
+
+	ecn_test_common "$name" $limit
+
+	# Up there we saw that UDP gets accepted when backlog is below the
+	# limit. Now that it is above, it should all get dropped, and backlog
+	# building should fail.
+	RET=0
+	build_backlog $((2 * limit)) udp >/dev/null
+	check_fail $? "UDP traffic went into backlog instead of being early-dropped"
+	log_test "$name backlog > limit: UDP early-dropped"
+
+	stop_traffic
+	sleep 1
+}
+
+do_ecn_nodrop_test()
+{
+	local limit=$1; shift
+	local name="ECN nodrop"
+
+	$MZ $h1 -p $PKTSZ -A 192.0.2.1 -B 192.0.2.3 -c 0 \
+		-a own -b $h3_mac -t tcp -q tos=0x01 &
+	sleep 1
+
+	ecn_test_common "$name" $limit
+
+	# Up there we saw that UDP gets accepted when backlog is below the
+	# limit. Now that it is above, in nodrop mode, make sure it goes to
+	# backlog as well.
+	RET=0
+	build_backlog $((2 * limit)) udp >/dev/null
+	check_err $? "UDP traffic was early-dropped instead of getting into backlog"
+	log_test "$name backlog > limit: UDP not dropped"
+
+	stop_traffic
+	sleep 1
+}
+
+do_red_test()
+{
+	local limit=$1; shift
+	local backlog
+	local pct
+
+	# Use ECN-capable TCP to verify there's no marking even though the queue
+	# is above limit.
+	$MZ $h1 -p $PKTSZ -A 192.0.2.1 -B 192.0.2.3 -c 0 \
+		-a own -b $h3_mac -t tcp -q tos=0x01 &
+
+	# Pushing below the queue limit should work.
+	RET=0
+	backlog=$(build_backlog $((2 * limit / 3)) tcp tos=0x01)
+	check_err $? "Could not build the requested backlog"
+	pct=$(check_marking "== 0")
+	check_err $? "backlog $backlog / $limit Got $pct% marked packets, expected == 0."
+	log_test "RED backlog < limit"
+
+	# Pushing above should not.
+	RET=0
+	backlog=$(build_backlog $((3 * limit / 2)) tcp tos=0x01)
+	check_fail $? "Traffic went into backlog instead of being early-dropped"
+	pct=$(check_marking "== 0")
+	check_err $? "backlog $backlog / $limit Got $pct% marked packets, expected == 0."
+	log_test "RED backlog > limit"
+
+	stop_traffic
+	sleep 1
+}
+
+do_red_qevent_test()
+{
+	local limit=$1; shift
+	local backlog
+	local base
+	local now
+	local pct
+
+	RET=0
+
+	$MZ $h1 -p $PKTSZ -A 192.0.2.1 -B 192.0.2.3 -c 0 \
+		-a own -b $h3_mac -t udp -q &
+	sleep 1
+
+	tc filter add block 10 pref 1234 handle 102 matchall skip_hw \
+	   action mirred egress mirror dev _drop_test
+
+	# Push to the queue until it's at the limit. The configured limit is
+	# rounded by the qdisc, so this is the best we can do to get to the real
+	# limit.
+	build_backlog $((3 * limit / 2)) udp >/dev/null
+
+	base=$(get_nmirrored)
+	send_packets udp 100
+	sleep 1
+	now=$(get_nmirrored)
+	((now >= base + 100))
+	check_err $? "Dropped packets not observed: 100 expected, $((now - base)) seen"
+
+	tc filter del block 10 pref 1234 handle 102 matchall
+
+	base=$(get_nmirrored)
+	send_packets udp 100
+	sleep 1
+	now=$(get_nmirrored)
+	((now == base))
+	check_err $? "Dropped packets still observed: 0 expected, $((now - base)) seen"
+
+	log_test "RED early_dropped packets mirrored"
+
+	stop_traffic
+	sleep 1
+}
+
+do_ecn_qevent_test()
+{
+	local limit=$1; shift
+	local backlog
+	local pct
+
+	RET=0
+
+	$MZ $h1 -p $PKTSZ -A 192.0.2.1 -B 192.0.2.3 -c 0 \
+		-a own -b $h3_mac -t tcp -q tos=0x01 &
+	sleep 1
+
+	tc filter add block 10 pref 1234 handle 102 matchall skip_hw \
+	   action mirred egress mirror dev _drop_test
+
+	backlog=$(build_backlog $((2 * limit / 3)) tcp tos=0x01)
+	check_err $? "Could not build the requested backlog"
+	pct=$(check_mirroring "== 0")
+	check_err $? "backlog $backlog / $limit Got $pct% mirrored packets, expected == 0."
+
+	backlog=$(build_backlog $((3 * limit / 2)) tcp tos=0x01)
+	check_err $? "Could not build the requested backlog"
+	pct=$(check_mirroring ">= 95")
+	check_err $? "backlog $backlog / $limit Got $pct% mirrored packets, expected >= 95."
+
+	tc filter del block 10 pref 1234 handle 102 matchall
+
+	log_test "ECN marked packets mirrored"
+
+	stop_traffic
+	sleep 1
+}
+
+install_qdisc()
+{
+	local -a args=("$@")
+
+	tc qdisc replace dev $swp3 parent 1:1 handle 11: red \
+	   limit 1M avpkt $PKTSZ probability 1 \
+	   min $BACKLOG max $((BACKLOG + 1)) burst 38 "${args[@]}"
+	sleep 1
+}
+
+uninstall_qdisc()
+{
+	tc qdisc del dev $swp3 parent 1:1
+}
+
+ecn_test()
+{
+	install_qdisc ecn
+	do_ecn_test $BACKLOG
+	uninstall_qdisc
+}
+
+ecn_nodrop_test()
+{
+	install_qdisc ecn nodrop
+	do_ecn_nodrop_test $BACKLOG
+	uninstall_qdisc
+}
+
+red_test()
+{
+	install_qdisc
+	do_red_test $BACKLOG
+	uninstall_qdisc
+}
+
+red_qevent_test()
+{
+	install_qdisc qevent early_drop block 10
+	do_red_qevent_test $BACKLOG
+	uninstall_qdisc
+}
+
+ecn_qevent_test()
+{
+	install_qdisc ecn qevent mark block 10
+	do_ecn_qevent_test $BACKLOG
+	uninstall_qdisc
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS