linux_dsm_epyc7002/net/packet/diag.c

266 lines
6.3 KiB
C
Raw Normal View History

#include <linux/module.h>
#include <linux/sock_diag.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/packet_diag.h>
packet: use percpu mmap tx frame pending refcount In PF_PACKET's packet mmap(), we can avoid using one atomic_inc() and one atomic_dec() call in skb destructor and use a percpu reference count instead in order to determine if packets are still pending to be sent out. Micro-benchmark with [1] that has been slightly modified (that is, protcol = 0 in socket(2) and bind(2)), example on a rather crappy testing machine; I expect it to scale and have even better results on bigger machines: ./packet_mm_tx -s7000 -m7200 -z700000 em1, avg over 2500 runs: With patch: 4,022,015 cyc Without patch: 4,812,994 cyc time ./packet_mm_tx -s64 -c10000000 em1 > /dev/null, stable: With patch: real 1m32.241s user 0m0.287s sys 1m29.316s Without patch: real 1m38.386s user 0m0.265s sys 1m35.572s In function tpacket_snd(), it is okay to use packet_read_pending() since in fast-path we short-circuit the condition already with ph != NULL, since we have next frames to process. In case we have MSG_DONTWAIT, we also do not execute this path as need_wait is false here anyway, and in case of _no_ MSG_DONTWAIT flag, it is okay to call a packet_read_pending(), because when we ever reach that path, we're done processing outgoing frames anyway and only look if there are skbs still outstanding to be orphaned. We can stay lockless in this percpu counter since it's acceptable when we reach this path for the sum to be imprecise first, but we'll level out at 0 after all pending frames have reached the skb destructor eventually through tx reclaim. When people pin a tx process to particular CPUs, we expect overflows to happen in the reference counter as on one CPU we expect heavy increase; and distributed through ksoftirqd on all CPUs a decrease, for example. As David Laight points out, since the C language doesn't define the result of signed int overflow (i.e. rather than wrap, it is allowed to saturate as a possible outcome), we have to use unsigned int as reference count. The sum over all CPUs when tx is complete will result in 0 again. The BUG_ON() in tpacket_destruct_skb() we can remove as well. It can _only_ be set from inside tpacket_snd() path and we made sure to increase tx_ring.pending in any case before we called po->xmit(skb). So testing for tx_ring.pending == 0 is not too useful. Instead, it would rather have been useful to test if lower layers didn't orphan the skb so that we're missing ring slots being put back to TP_STATUS_AVAILABLE. But such a bug will be caught in user space already as we end up realizing that we do not have any TP_STATUS_AVAILABLE slots left anymore. Therefore, we're all set. Btw, in case of RX_RING path, we do not make use of the pending member, therefore we also don't need to use up any percpu memory here. Also note that __alloc_percpu() already returns a zero-filled percpu area, so initialization is done already. [1] http://wiki.ipxwarzone.com/index.php5?title=Linux_packet_mmap Signed-off-by: Daniel Borkmann <dborkman@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-01-15 22:25:36 +07:00
#include <linux/percpu.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include "internal.h"
static int pdiag_put_info(const struct packet_sock *po, struct sk_buff *nlskb)
{
struct packet_diag_info pinfo;
pinfo.pdi_index = po->ifindex;
pinfo.pdi_version = po->tp_version;
pinfo.pdi_reserve = po->tp_reserve;
pinfo.pdi_copy_thresh = po->copy_thresh;
pinfo.pdi_tstamp = po->tp_tstamp;
pinfo.pdi_flags = 0;
if (po->running)
pinfo.pdi_flags |= PDI_RUNNING;
if (po->auxdata)
pinfo.pdi_flags |= PDI_AUXDATA;
if (po->origdev)
pinfo.pdi_flags |= PDI_ORIGDEV;
if (po->has_vnet_hdr)
pinfo.pdi_flags |= PDI_VNETHDR;
if (po->tp_loss)
pinfo.pdi_flags |= PDI_LOSS;
return nla_put(nlskb, PACKET_DIAG_INFO, sizeof(pinfo), &pinfo);
}
static int pdiag_put_mclist(const struct packet_sock *po, struct sk_buff *nlskb)
{
struct nlattr *mca;
struct packet_mclist *ml;
mca = nla_nest_start(nlskb, PACKET_DIAG_MCLIST);
if (!mca)
return -EMSGSIZE;
rtnl_lock();
for (ml = po->mclist; ml; ml = ml->next) {
struct packet_diag_mclist *dml;
dml = nla_reserve_nohdr(nlskb, sizeof(*dml));
if (!dml) {
rtnl_unlock();
nla_nest_cancel(nlskb, mca);
return -EMSGSIZE;
}
dml->pdmc_index = ml->ifindex;
dml->pdmc_type = ml->type;
dml->pdmc_alen = ml->alen;
dml->pdmc_count = ml->count;
BUILD_BUG_ON(sizeof(dml->pdmc_addr) != sizeof(ml->addr));
memcpy(dml->pdmc_addr, ml->addr, sizeof(ml->addr));
}
rtnl_unlock();
nla_nest_end(nlskb, mca);
return 0;
}
static int pdiag_put_ring(struct packet_ring_buffer *ring, int ver, int nl_type,
struct sk_buff *nlskb)
{
struct packet_diag_ring pdr;
if (!ring->pg_vec || ((ver > TPACKET_V2) &&
(nl_type == PACKET_DIAG_TX_RING)))
return 0;
pdr.pdr_block_size = ring->pg_vec_pages << PAGE_SHIFT;
pdr.pdr_block_nr = ring->pg_vec_len;
pdr.pdr_frame_size = ring->frame_size;
pdr.pdr_frame_nr = ring->frame_max + 1;
if (ver > TPACKET_V2) {
pdr.pdr_retire_tmo = ring->prb_bdqc.retire_blk_tov;
pdr.pdr_sizeof_priv = ring->prb_bdqc.blk_sizeof_priv;
pdr.pdr_features = ring->prb_bdqc.feature_req_word;
} else {
pdr.pdr_retire_tmo = 0;
pdr.pdr_sizeof_priv = 0;
pdr.pdr_features = 0;
}
return nla_put(nlskb, nl_type, sizeof(pdr), &pdr);
}
static int pdiag_put_rings_cfg(struct packet_sock *po, struct sk_buff *skb)
{
int ret;
mutex_lock(&po->pg_vec_lock);
ret = pdiag_put_ring(&po->rx_ring, po->tp_version,
PACKET_DIAG_RX_RING, skb);
if (!ret)
ret = pdiag_put_ring(&po->tx_ring, po->tp_version,
PACKET_DIAG_TX_RING, skb);
mutex_unlock(&po->pg_vec_lock);
return ret;
}
static int pdiag_put_fanout(struct packet_sock *po, struct sk_buff *nlskb)
{
int ret = 0;
mutex_lock(&fanout_mutex);
if (po->fanout) {
u32 val;
val = (u32)po->fanout->id | ((u32)po->fanout->type << 16);
ret = nla_put_u32(nlskb, PACKET_DIAG_FANOUT, val);
}
mutex_unlock(&fanout_mutex);
return ret;
}
static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
struct packet_diag_req *req,
bool may_report_filterinfo,
struct user_namespace *user_ns,
u32 portid, u32 seq, u32 flags, int sk_ino)
{
struct nlmsghdr *nlh;
struct packet_diag_msg *rp;
struct packet_sock *po = pkt_sk(sk);
nlh = nlmsg_put(skb, portid, seq, SOCK_DIAG_BY_FAMILY, sizeof(*rp), flags);
if (!nlh)
return -EMSGSIZE;
rp = nlmsg_data(nlh);
rp->pdiag_family = AF_PACKET;
rp->pdiag_type = sk->sk_type;
rp->pdiag_num = ntohs(po->num);
rp->pdiag_ino = sk_ino;
sock_diag_save_cookie(sk, rp->pdiag_cookie);
if ((req->pdiag_show & PACKET_SHOW_INFO) &&
pdiag_put_info(po, skb))
goto out_nlmsg_trim;
if ((req->pdiag_show & PACKET_SHOW_INFO) &&
nla_put_u32(skb, PACKET_DIAG_UID,
from_kuid_munged(user_ns, sock_i_uid(sk))))
goto out_nlmsg_trim;
if ((req->pdiag_show & PACKET_SHOW_MCLIST) &&
pdiag_put_mclist(po, skb))
goto out_nlmsg_trim;
if ((req->pdiag_show & PACKET_SHOW_RING_CFG) &&
pdiag_put_rings_cfg(po, skb))
goto out_nlmsg_trim;
if ((req->pdiag_show & PACKET_SHOW_FANOUT) &&
pdiag_put_fanout(po, skb))
goto out_nlmsg_trim;
if ((req->pdiag_show & PACKET_SHOW_MEMINFO) &&
sock_diag_put_meminfo(sk, skb, PACKET_DIAG_MEMINFO))
goto out_nlmsg_trim;
if ((req->pdiag_show & PACKET_SHOW_FILTER) &&
sock_diag_put_filterinfo(may_report_filterinfo, sk, skb,
PACKET_DIAG_FILTER))
goto out_nlmsg_trim;
netlink: make nlmsg_end() and genlmsg_end() void Contrary to common expectations for an "int" return, these functions return only a positive value -- if used correctly they cannot even return 0 because the message header will necessarily be in the skb. This makes the very common pattern of if (genlmsg_end(...) < 0) { ... } be a whole bunch of dead code. Many places also simply do return nlmsg_end(...); and the caller is expected to deal with it. This also commonly (at least for me) causes errors, because it is very common to write if (my_function(...)) /* error condition */ and if my_function() does "return nlmsg_end()" this is of course wrong. Additionally, there's not a single place in the kernel that actually needs the message length returned, and if anyone needs it later then it'll be very easy to just use skb->len there. Remove this, and make the functions void. This removes a bunch of dead code as described above. The patch adds lines because I did - return nlmsg_end(...); + nlmsg_end(...); + return 0; I could have preserved all the function's return values by returning skb->len, but instead I've audited all the places calling the affected functions and found that none cared. A few places actually compared the return value with <= 0 in dump functionality, but that could just be changed to < 0 with no change in behaviour, so I opted for the more efficient version. One instance of the error I've made numerous times now is also present in net/phonet/pn_netlink.c in the route_dumpit() function - it didn't check for <0 or <=0 and thus broke out of the loop every single time. I've preserved this since it will (I think) have caused the messages to userspace to be formatted differently with just a single message for every SKB returned to userspace. It's possible that this isn't needed for the tools that actually use this, but I don't even know what they are so couldn't test that changing this behaviour would be acceptable. Signed-off-by: Johannes Berg <johannes.berg@intel.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-17 04:09:00 +07:00
nlmsg_end(skb, nlh);
return 0;
out_nlmsg_trim:
nlmsg_cancel(skb, nlh);
return -EMSGSIZE;
}
static int packet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
int num = 0, s_num = cb->args[0];
struct packet_diag_req *req;
struct net *net;
struct sock *sk;
bool may_report_filterinfo;
net = sock_net(skb->sk);
req = nlmsg_data(cb->nlh);
may_report_filterinfo = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
packet: Protect packet sk list with mutex (v2) Change since v1: * Fixed inuse counters access spotted by Eric In patch eea68e2f (packet: Report socket mclist info via diag module) I've introduced a "scheduling in atomic" problem in packet diag module -- the socket list is traversed under rcu_read_lock() while performed under it sk mclist access requires rtnl lock (i.e. -- mutex) to be taken. [152363.820563] BUG: scheduling while atomic: crtools/12517/0x10000002 [152363.820573] 4 locks held by crtools/12517: [152363.820581] #0: (sock_diag_mutex){+.+.+.}, at: [<ffffffff81a2dcb5>] sock_diag_rcv+0x1f/0x3e [152363.820613] #1: (sock_diag_table_mutex){+.+.+.}, at: [<ffffffff81a2de70>] sock_diag_rcv_msg+0xdb/0x11a [152363.820644] #2: (nlk->cb_mutex){+.+.+.}, at: [<ffffffff81a67d01>] netlink_dump+0x23/0x1ab [152363.820693] #3: (rcu_read_lock){.+.+..}, at: [<ffffffff81b6a049>] packet_diag_dump+0x0/0x1af Similar thing was then re-introduced by further packet diag patches (fanount mutex and pgvec mutex for rings) :( Apart from being terribly sorry for the above, I propose to change the packet sk list protection from spinlock to mutex. This lock currently protects two modifications: * sklist * prot inuse counters The sklist modifications can be just reprotected with mutex since they already occur in a sleeping context. The inuse counters modifications are trickier -- the __this_cpu_-s are used inside, thus requiring the caller to handle the potential issues with contexts himself. Since packet sockets' counters are modified in two places only (packet_create and packet_release) we only need to protect the context from being preempted. BH disabling is not required in this case. Signed-off-by: Pavel Emelyanov <xemul@parallels.com> Acked-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2012-08-21 08:06:47 +07:00
mutex_lock(&net->packet.sklist_lock);
hlist: drop the node parameter from iterators I'm not sure why, but the hlist for each entry iterators were conceived list_for_each_entry(pos, head, member) The hlist ones were greedy and wanted an extra parameter: hlist_for_each_entry(tpos, pos, head, member) Why did they need an extra pos parameter? I'm not quite sure. Not only they don't really need it, it also prevents the iterator from looking exactly like the list iterator, which is unfortunate. Besides the semantic patch, there was some manual work required: - Fix up the actual hlist iterators in linux/list.h - Fix up the declaration of other iterators based on the hlist ones. - A very small amount of places were using the 'node' parameter, this was modified to use 'obj->member' instead. - Coccinelle didn't handle the hlist_for_each_entry_safe iterator properly, so those had to be fixed up manually. The semantic patch which is mostly the work of Peter Senna Tschudin is here: @@ iterator name hlist_for_each_entry, hlist_for_each_entry_continue, hlist_for_each_entry_from, hlist_for_each_entry_rcu, hlist_for_each_entry_rcu_bh, hlist_for_each_entry_continue_rcu_bh, for_each_busy_worker, ax25_uid_for_each, ax25_for_each, inet_bind_bucket_for_each, sctp_for_each_hentry, sk_for_each, sk_for_each_rcu, sk_for_each_from, sk_for_each_safe, sk_for_each_bound, hlist_for_each_entry_safe, hlist_for_each_entry_continue_rcu, nr_neigh_for_each, nr_neigh_for_each_safe, nr_node_for_each, nr_node_for_each_safe, for_each_gfn_indirect_valid_sp, for_each_gfn_sp, for_each_host; type T; expression a,c,d,e; identifier b; statement S; @@ -T b; <+... when != b ( hlist_for_each_entry(a, - b, c, d) S | hlist_for_each_entry_continue(a, - b, c) S | hlist_for_each_entry_from(a, - b, c) S | hlist_for_each_entry_rcu(a, - b, c, d) S | hlist_for_each_entry_rcu_bh(a, - b, c, d) S | hlist_for_each_entry_continue_rcu_bh(a, - b, c) S | for_each_busy_worker(a, c, - b, d) S | ax25_uid_for_each(a, - b, c) S | ax25_for_each(a, - b, c) S | inet_bind_bucket_for_each(a, - b, c) S | sctp_for_each_hentry(a, - b, c) S | sk_for_each(a, - b, c) S | sk_for_each_rcu(a, - b, c) S | sk_for_each_from -(a, b) +(a) S + sk_for_each_from(a) S | sk_for_each_safe(a, - b, c, d) S | sk_for_each_bound(a, - b, c) S | hlist_for_each_entry_safe(a, - b, c, d, e) S | hlist_for_each_entry_continue_rcu(a, - b, c) S | nr_neigh_for_each(a, - b, c) S | nr_neigh_for_each_safe(a, - b, c, d) S | nr_node_for_each(a, - b, c) S | nr_node_for_each_safe(a, - b, c, d) S | - for_each_gfn_sp(a, c, d, b) S + for_each_gfn_sp(a, c, d) S | - for_each_gfn_indirect_valid_sp(a, c, d, b) S + for_each_gfn_indirect_valid_sp(a, c, d) S | for_each_host(a, - b, c) S | for_each_host_safe(a, - b, c, d) S | for_each_mesh_entry(a, - b, c, d) S ) ...+> [akpm@linux-foundation.org: drop bogus change from net/ipv4/raw.c] [akpm@linux-foundation.org: drop bogus hunk from net/ipv6/raw.c] [akpm@linux-foundation.org: checkpatch fixes] [akpm@linux-foundation.org: fix warnings] [akpm@linux-foudnation.org: redo intrusive kvm changes] Tested-by: Peter Senna Tschudin <peter.senna@gmail.com> Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Signed-off-by: Sasha Levin <sasha.levin@oracle.com> Cc: Wu Fengguang <fengguang.wu@intel.com> Cc: Marcelo Tosatti <mtosatti@redhat.com> Cc: Gleb Natapov <gleb@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-02-28 08:06:00 +07:00
sk_for_each(sk, &net->packet.sklist) {
if (!net_eq(sock_net(sk), net))
continue;
if (num < s_num)
goto next;
if (sk_diag_fill(sk, skb, req,
may_report_filterinfo,
sk_user_ns(NETLINK_CB(cb->skb).sk),
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI,
sock_i_ino(sk)) < 0)
goto done;
next:
num++;
}
done:
packet: Protect packet sk list with mutex (v2) Change since v1: * Fixed inuse counters access spotted by Eric In patch eea68e2f (packet: Report socket mclist info via diag module) I've introduced a "scheduling in atomic" problem in packet diag module -- the socket list is traversed under rcu_read_lock() while performed under it sk mclist access requires rtnl lock (i.e. -- mutex) to be taken. [152363.820563] BUG: scheduling while atomic: crtools/12517/0x10000002 [152363.820573] 4 locks held by crtools/12517: [152363.820581] #0: (sock_diag_mutex){+.+.+.}, at: [<ffffffff81a2dcb5>] sock_diag_rcv+0x1f/0x3e [152363.820613] #1: (sock_diag_table_mutex){+.+.+.}, at: [<ffffffff81a2de70>] sock_diag_rcv_msg+0xdb/0x11a [152363.820644] #2: (nlk->cb_mutex){+.+.+.}, at: [<ffffffff81a67d01>] netlink_dump+0x23/0x1ab [152363.820693] #3: (rcu_read_lock){.+.+..}, at: [<ffffffff81b6a049>] packet_diag_dump+0x0/0x1af Similar thing was then re-introduced by further packet diag patches (fanount mutex and pgvec mutex for rings) :( Apart from being terribly sorry for the above, I propose to change the packet sk list protection from spinlock to mutex. This lock currently protects two modifications: * sklist * prot inuse counters The sklist modifications can be just reprotected with mutex since they already occur in a sleeping context. The inuse counters modifications are trickier -- the __this_cpu_-s are used inside, thus requiring the caller to handle the potential issues with contexts himself. Since packet sockets' counters are modified in two places only (packet_create and packet_release) we only need to protect the context from being preempted. BH disabling is not required in this case. Signed-off-by: Pavel Emelyanov <xemul@parallels.com> Acked-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2012-08-21 08:06:47 +07:00
mutex_unlock(&net->packet.sklist_lock);
cb->args[0] = num;
return skb->len;
}
static int packet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
{
int hdrlen = sizeof(struct packet_diag_req);
struct net *net = sock_net(skb->sk);
struct packet_diag_req *req;
if (nlmsg_len(h) < hdrlen)
return -EINVAL;
req = nlmsg_data(h);
/* Make it possible to support protocol filtering later */
if (req->sdiag_protocol)
return -EINVAL;
if (h->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.dump = packet_diag_dump,
};
return netlink_dump_start(net->diag_nlsk, skb, h, &c);
} else
return -EOPNOTSUPP;
}
static const struct sock_diag_handler packet_diag_handler = {
.family = AF_PACKET,
.dump = packet_diag_handler_dump,
};
static int __init packet_diag_init(void)
{
return sock_diag_register(&packet_diag_handler);
}
static void __exit packet_diag_exit(void)
{
sock_diag_unregister(&packet_diag_handler);
}
module_init(packet_diag_init);
module_exit(packet_diag_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 17 /* AF_PACKET */);