linux_dsm_epyc7002/net/sched/sch_mq.c
Eric Dumazet edb09eb17e net: sched: do not acquire qdisc spinlock in qdisc/class stats dump
Large tc dumps (tc -s {qdisc|class} sh dev ethX) done by Google BwE host
agent [1] are problematic at scale :

For each qdisc/class found in the dump, we currently lock the root qdisc
spinlock in order to get stats. Sampling stats every 5 seconds from
thousands of HTB classes is a challenge when the root qdisc spinlock is
under high pressure. Not only the dumps take time, they also slow
down the fast path (queue/dequeue packets) by 10 % to 20 % in some cases.

An audit of existing qdiscs showed that sch_fq_codel is the only qdisc
that might need the qdisc lock in fq_codel_dump_stats() and
fq_codel_dump_class_stats()

In v2 of this patch, I now use the Qdisc running seqcount to provide
consistent reads of packets/bytes counters, regardless of 32/64 bit arches.

I also changed rate estimators to use the same infrastructure
so that they no longer need to lock root qdisc lock.

[1]
http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/43838.pdf

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: Kevin Athey <kda@google.com>
Cc: Xiaotian Pei <xiaotian@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-06-07 16:37:14 -07:00

247 lines
5.8 KiB
C

/*
* net/sched/sch_mq.c Classful multiqueue dummy scheduler
*
* Copyright (c) 2009 Patrick McHardy <kaber@trash.net>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 2 as published by the Free Software Foundation.
*/
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
struct mq_sched {
struct Qdisc **qdiscs;
};
static void mq_destroy(struct Qdisc *sch)
{
struct net_device *dev = qdisc_dev(sch);
struct mq_sched *priv = qdisc_priv(sch);
unsigned int ntx;
if (!priv->qdiscs)
return;
for (ntx = 0; ntx < dev->num_tx_queues && priv->qdiscs[ntx]; ntx++)
qdisc_destroy(priv->qdiscs[ntx]);
kfree(priv->qdiscs);
}
static int mq_init(struct Qdisc *sch, struct nlattr *opt)
{
struct net_device *dev = qdisc_dev(sch);
struct mq_sched *priv = qdisc_priv(sch);
struct netdev_queue *dev_queue;
struct Qdisc *qdisc;
unsigned int ntx;
if (sch->parent != TC_H_ROOT)
return -EOPNOTSUPP;
if (!netif_is_multiqueue(dev))
return -EOPNOTSUPP;
/* pre-allocate qdiscs, attachment can't fail */
priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]),
GFP_KERNEL);
if (priv->qdiscs == NULL)
return -ENOMEM;
for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
dev_queue = netdev_get_tx_queue(dev, ntx);
qdisc = qdisc_create_dflt(dev_queue, get_default_qdisc_ops(dev, ntx),
TC_H_MAKE(TC_H_MAJ(sch->handle),
TC_H_MIN(ntx + 1)));
if (qdisc == NULL)
goto err;
priv->qdiscs[ntx] = qdisc;
qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
}
sch->flags |= TCQ_F_MQROOT;
return 0;
err:
mq_destroy(sch);
return -ENOMEM;
}
static void mq_attach(struct Qdisc *sch)
{
struct net_device *dev = qdisc_dev(sch);
struct mq_sched *priv = qdisc_priv(sch);
struct Qdisc *qdisc, *old;
unsigned int ntx;
for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
qdisc = priv->qdiscs[ntx];
old = dev_graft_qdisc(qdisc->dev_queue, qdisc);
if (old)
qdisc_destroy(old);
#ifdef CONFIG_NET_SCHED
if (ntx < dev->real_num_tx_queues)
qdisc_list_add(qdisc);
#endif
}
kfree(priv->qdiscs);
priv->qdiscs = NULL;
}
static int mq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct net_device *dev = qdisc_dev(sch);
struct Qdisc *qdisc;
unsigned int ntx;
sch->q.qlen = 0;
memset(&sch->bstats, 0, sizeof(sch->bstats));
memset(&sch->qstats, 0, sizeof(sch->qstats));
for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping;
spin_lock_bh(qdisc_lock(qdisc));
sch->q.qlen += qdisc->q.qlen;
sch->bstats.bytes += qdisc->bstats.bytes;
sch->bstats.packets += qdisc->bstats.packets;
sch->qstats.backlog += qdisc->qstats.backlog;
sch->qstats.drops += qdisc->qstats.drops;
sch->qstats.requeues += qdisc->qstats.requeues;
sch->qstats.overlimits += qdisc->qstats.overlimits;
spin_unlock_bh(qdisc_lock(qdisc));
}
return 0;
}
static struct netdev_queue *mq_queue_get(struct Qdisc *sch, unsigned long cl)
{
struct net_device *dev = qdisc_dev(sch);
unsigned long ntx = cl - 1;
if (ntx >= dev->num_tx_queues)
return NULL;
return netdev_get_tx_queue(dev, ntx);
}
static struct netdev_queue *mq_select_queue(struct Qdisc *sch,
struct tcmsg *tcm)
{
unsigned int ntx = TC_H_MIN(tcm->tcm_parent);
struct netdev_queue *dev_queue = mq_queue_get(sch, ntx);
if (!dev_queue) {
struct net_device *dev = qdisc_dev(sch);
return netdev_get_tx_queue(dev, 0);
}
return dev_queue;
}
static int mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
struct Qdisc **old)
{
struct netdev_queue *dev_queue = mq_queue_get(sch, cl);
struct net_device *dev = qdisc_dev(sch);
if (dev->flags & IFF_UP)
dev_deactivate(dev);
*old = dev_graft_qdisc(dev_queue, new);
if (new)
new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
if (dev->flags & IFF_UP)
dev_activate(dev);
return 0;
}
static struct Qdisc *mq_leaf(struct Qdisc *sch, unsigned long cl)
{
struct netdev_queue *dev_queue = mq_queue_get(sch, cl);
return dev_queue->qdisc_sleeping;
}
static unsigned long mq_get(struct Qdisc *sch, u32 classid)
{
unsigned int ntx = TC_H_MIN(classid);
if (!mq_queue_get(sch, ntx))
return 0;
return ntx;
}
static void mq_put(struct Qdisc *sch, unsigned long cl)
{
}
static int mq_dump_class(struct Qdisc *sch, unsigned long cl,
struct sk_buff *skb, struct tcmsg *tcm)
{
struct netdev_queue *dev_queue = mq_queue_get(sch, cl);
tcm->tcm_parent = TC_H_ROOT;
tcm->tcm_handle |= TC_H_MIN(cl);
tcm->tcm_info = dev_queue->qdisc_sleeping->handle;
return 0;
}
static int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
struct gnet_dump *d)
{
struct netdev_queue *dev_queue = mq_queue_get(sch, cl);
sch = dev_queue->qdisc_sleeping;
if (gnet_stats_copy_basic(&sch->running, d, NULL, &sch->bstats) < 0 ||
gnet_stats_copy_queue(d, NULL, &sch->qstats, sch->q.qlen) < 0)
return -1;
return 0;
}
static void mq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
struct net_device *dev = qdisc_dev(sch);
unsigned int ntx;
if (arg->stop)
return;
arg->count = arg->skip;
for (ntx = arg->skip; ntx < dev->num_tx_queues; ntx++) {
if (arg->fn(sch, ntx + 1, arg) < 0) {
arg->stop = 1;
break;
}
arg->count++;
}
}
static const struct Qdisc_class_ops mq_class_ops = {
.select_queue = mq_select_queue,
.graft = mq_graft,
.leaf = mq_leaf,
.get = mq_get,
.put = mq_put,
.walk = mq_walk,
.dump = mq_dump_class,
.dump_stats = mq_dump_class_stats,
};
struct Qdisc_ops mq_qdisc_ops __read_mostly = {
.cl_ops = &mq_class_ops,
.id = "mq",
.priv_size = sizeof(struct mq_sched),
.init = mq_init,
.destroy = mq_destroy,
.attach = mq_attach,
.dump = mq_dump,
.owner = THIS_MODULE,
};