linux_dsm_epyc7002/net/netlink/diag.c

260 lines
5.4 KiB
C
Raw Normal View History

#include <linux/module.h>
#include <net/sock.h>
#include <linux/netlink.h>
#include <linux/sock_diag.h>
#include <linux/netlink_diag.h>
netlink: Convert netlink_lookup() to use RCU protected hash table Heavy Netlink users such as Open vSwitch spend a considerable amount of time in netlink_lookup() due to the read-lock on nl_table_lock. Use of RCU relieves the lock contention. Makes use of the new resizable hash table to avoid locking on the lookup. The hash table will grow if entries exceeds 75% of table size up to a total table size of 64K. It will automatically shrink if usage falls below 30%. Also splits nl_table_lock into a separate mutex to protect hash table mutations and allow synchronize_rcu() to sleep while waiting for readers during expansion and shrinking. Before: 9.16% kpktgend_0 [openvswitch] [k] masked_flow_lookup 6.42% kpktgend_0 [pktgen] [k] mod_cur_headers 6.26% kpktgend_0 [pktgen] [k] pktgen_thread_worker 6.23% kpktgend_0 [kernel.kallsyms] [k] memset 4.79% kpktgend_0 [kernel.kallsyms] [k] netlink_lookup 4.37% kpktgend_0 [kernel.kallsyms] [k] memcpy 3.60% kpktgend_0 [openvswitch] [k] ovs_flow_extract 2.69% kpktgend_0 [kernel.kallsyms] [k] jhash2 After: 15.26% kpktgend_0 [openvswitch] [k] masked_flow_lookup 8.12% kpktgend_0 [pktgen] [k] pktgen_thread_worker 7.92% kpktgend_0 [pktgen] [k] mod_cur_headers 5.11% kpktgend_0 [kernel.kallsyms] [k] memset 4.11% kpktgend_0 [openvswitch] [k] ovs_flow_extract 4.06% kpktgend_0 [kernel.kallsyms] [k] _raw_spin_lock 3.90% kpktgend_0 [kernel.kallsyms] [k] jhash2 [...] 0.67% kpktgend_0 [kernel.kallsyms] [k] netlink_lookup Signed-off-by: Thomas Graf <tgraf@suug.ch> Reviewed-by: Nikolay Aleksandrov <nikolay@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-08-02 16:47:45 +07:00
#include <linux/rhashtable.h>
#include "af_netlink.h"
static int sk_diag_dump_groups(struct sock *sk, struct sk_buff *nlskb)
{
struct netlink_sock *nlk = nlk_sk(sk);
if (nlk->groups == NULL)
return 0;
return nla_put(nlskb, NETLINK_DIAG_GROUPS, NLGRPSZ(nlk->ngroups),
nlk->groups);
}
static int sk_diag_put_flags(struct sock *sk, struct sk_buff *skb)
{
struct netlink_sock *nlk = nlk_sk(sk);
u32 flags = 0;
if (nlk->cb_running)
flags |= NDIAG_FLAG_CB_RUNNING;
if (nlk->flags & NETLINK_F_RECV_PKTINFO)
flags |= NDIAG_FLAG_PKTINFO;
if (nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR)
flags |= NDIAG_FLAG_BROADCAST_ERROR;
if (nlk->flags & NETLINK_F_RECV_NO_ENOBUFS)
flags |= NDIAG_FLAG_NO_ENOBUFS;
if (nlk->flags & NETLINK_F_LISTEN_ALL_NSID)
flags |= NDIAG_FLAG_LISTEN_ALL_NSID;
if (nlk->flags & NETLINK_F_CAP_ACK)
flags |= NDIAG_FLAG_CAP_ACK;
return nla_put_u32(skb, NETLINK_DIAG_FLAGS, flags);
}
static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
struct netlink_diag_req *req,
u32 portid, u32 seq, u32 flags, int sk_ino)
{
struct nlmsghdr *nlh;
struct netlink_diag_msg *rep;
struct netlink_sock *nlk = nlk_sk(sk);
nlh = nlmsg_put(skb, portid, seq, SOCK_DIAG_BY_FAMILY, sizeof(*rep),
flags);
if (!nlh)
return -EMSGSIZE;
rep = nlmsg_data(nlh);
rep->ndiag_family = AF_NETLINK;
rep->ndiag_type = sk->sk_type;
rep->ndiag_protocol = sk->sk_protocol;
rep->ndiag_state = sk->sk_state;
rep->ndiag_ino = sk_ino;
rep->ndiag_portid = nlk->portid;
rep->ndiag_dst_portid = nlk->dst_portid;
rep->ndiag_dst_group = nlk->dst_group;
sock_diag_save_cookie(sk, rep->ndiag_cookie);
if ((req->ndiag_show & NDIAG_SHOW_GROUPS) &&
sk_diag_dump_groups(sk, skb))
goto out_nlmsg_trim;
if ((req->ndiag_show & NDIAG_SHOW_MEMINFO) &&
sock_diag_put_meminfo(sk, skb, NETLINK_DIAG_MEMINFO))
goto out_nlmsg_trim;
if ((req->ndiag_show & NDIAG_SHOW_FLAGS) &&
sk_diag_put_flags(sk, skb))
goto out_nlmsg_trim;
netlink: make nlmsg_end() and genlmsg_end() void Contrary to common expectations for an "int" return, these functions return only a positive value -- if used correctly they cannot even return 0 because the message header will necessarily be in the skb. This makes the very common pattern of if (genlmsg_end(...) < 0) { ... } be a whole bunch of dead code. Many places also simply do return nlmsg_end(...); and the caller is expected to deal with it. This also commonly (at least for me) causes errors, because it is very common to write if (my_function(...)) /* error condition */ and if my_function() does "return nlmsg_end()" this is of course wrong. Additionally, there's not a single place in the kernel that actually needs the message length returned, and if anyone needs it later then it'll be very easy to just use skb->len there. Remove this, and make the functions void. This removes a bunch of dead code as described above. The patch adds lines because I did - return nlmsg_end(...); + nlmsg_end(...); + return 0; I could have preserved all the function's return values by returning skb->len, but instead I've audited all the places calling the affected functions and found that none cared. A few places actually compared the return value with <= 0 in dump functionality, but that could just be changed to < 0 with no change in behaviour, so I opted for the more efficient version. One instance of the error I've made numerous times now is also present in net/phonet/pn_netlink.c in the route_dumpit() function - it didn't check for <0 or <=0 and thus broke out of the loop every single time. I've preserved this since it will (I think) have caused the messages to userspace to be formatted differently with just a single message for every SKB returned to userspace. It's possible that this isn't needed for the tools that actually use this, but I don't even know what they are so couldn't test that changing this behaviour would be acceptable. Signed-off-by: Johannes Berg <johannes.berg@intel.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-17 04:09:00 +07:00
nlmsg_end(skb, nlh);
return 0;
out_nlmsg_trim:
nlmsg_cancel(skb, nlh);
return -EMSGSIZE;
}
static int __netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
int protocol, int s_num)
{
struct rhashtable_iter *hti = (void *)cb->args[2];
struct netlink_table *tbl = &nl_table[protocol];
struct net *net = sock_net(skb->sk);
struct netlink_diag_req *req;
netlink: Convert netlink_lookup() to use RCU protected hash table Heavy Netlink users such as Open vSwitch spend a considerable amount of time in netlink_lookup() due to the read-lock on nl_table_lock. Use of RCU relieves the lock contention. Makes use of the new resizable hash table to avoid locking on the lookup. The hash table will grow if entries exceeds 75% of table size up to a total table size of 64K. It will automatically shrink if usage falls below 30%. Also splits nl_table_lock into a separate mutex to protect hash table mutations and allow synchronize_rcu() to sleep while waiting for readers during expansion and shrinking. Before: 9.16% kpktgend_0 [openvswitch] [k] masked_flow_lookup 6.42% kpktgend_0 [pktgen] [k] mod_cur_headers 6.26% kpktgend_0 [pktgen] [k] pktgen_thread_worker 6.23% kpktgend_0 [kernel.kallsyms] [k] memset 4.79% kpktgend_0 [kernel.kallsyms] [k] netlink_lookup 4.37% kpktgend_0 [kernel.kallsyms] [k] memcpy 3.60% kpktgend_0 [openvswitch] [k] ovs_flow_extract 2.69% kpktgend_0 [kernel.kallsyms] [k] jhash2 After: 15.26% kpktgend_0 [openvswitch] [k] masked_flow_lookup 8.12% kpktgend_0 [pktgen] [k] pktgen_thread_worker 7.92% kpktgend_0 [pktgen] [k] mod_cur_headers 5.11% kpktgend_0 [kernel.kallsyms] [k] memset 4.11% kpktgend_0 [openvswitch] [k] ovs_flow_extract 4.06% kpktgend_0 [kernel.kallsyms] [k] _raw_spin_lock 3.90% kpktgend_0 [kernel.kallsyms] [k] jhash2 [...] 0.67% kpktgend_0 [kernel.kallsyms] [k] netlink_lookup Signed-off-by: Thomas Graf <tgraf@suug.ch> Reviewed-by: Nikolay Aleksandrov <nikolay@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-08-02 16:47:45 +07:00
struct netlink_sock *nlsk;
struct sock *sk;
int num = 2;
int ret = 0;
req = nlmsg_data(cb->nlh);
if (s_num > 1)
goto mc_list;
num--;
netlink: Convert netlink_lookup() to use RCU protected hash table Heavy Netlink users such as Open vSwitch spend a considerable amount of time in netlink_lookup() due to the read-lock on nl_table_lock. Use of RCU relieves the lock contention. Makes use of the new resizable hash table to avoid locking on the lookup. The hash table will grow if entries exceeds 75% of table size up to a total table size of 64K. It will automatically shrink if usage falls below 30%. Also splits nl_table_lock into a separate mutex to protect hash table mutations and allow synchronize_rcu() to sleep while waiting for readers during expansion and shrinking. Before: 9.16% kpktgend_0 [openvswitch] [k] masked_flow_lookup 6.42% kpktgend_0 [pktgen] [k] mod_cur_headers 6.26% kpktgend_0 [pktgen] [k] pktgen_thread_worker 6.23% kpktgend_0 [kernel.kallsyms] [k] memset 4.79% kpktgend_0 [kernel.kallsyms] [k] netlink_lookup 4.37% kpktgend_0 [kernel.kallsyms] [k] memcpy 3.60% kpktgend_0 [openvswitch] [k] ovs_flow_extract 2.69% kpktgend_0 [kernel.kallsyms] [k] jhash2 After: 15.26% kpktgend_0 [openvswitch] [k] masked_flow_lookup 8.12% kpktgend_0 [pktgen] [k] pktgen_thread_worker 7.92% kpktgend_0 [pktgen] [k] mod_cur_headers 5.11% kpktgend_0 [kernel.kallsyms] [k] memset 4.11% kpktgend_0 [openvswitch] [k] ovs_flow_extract 4.06% kpktgend_0 [kernel.kallsyms] [k] _raw_spin_lock 3.90% kpktgend_0 [kernel.kallsyms] [k] jhash2 [...] 0.67% kpktgend_0 [kernel.kallsyms] [k] netlink_lookup Signed-off-by: Thomas Graf <tgraf@suug.ch> Reviewed-by: Nikolay Aleksandrov <nikolay@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-08-02 16:47:45 +07:00
if (!hti) {
hti = kmalloc(sizeof(*hti), GFP_KERNEL);
if (!hti)
return -ENOMEM;
cb->args[2] = (long)hti;
}
if (!s_num)
rhashtable_walk_enter(&tbl->hash, hti);
rhashtable_walk_start(hti);
while ((nlsk = rhashtable_walk_next(hti))) {
if (IS_ERR(nlsk)) {
ret = PTR_ERR(nlsk);
if (ret == -EAGAIN) {
ret = 0;
continue;
}
break;
}
sk = (struct sock *)nlsk;
if (!net_eq(sock_net(sk), net))
continue;
if (sk_diag_fill(sk, skb, req,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
NLM_F_MULTI,
sock_i_ino(sk)) < 0) {
ret = 1;
break;
}
}
rhashtable_walk_stop(hti);
if (ret)
goto done;
rhashtable_walk_exit(hti);
num++;
mc_list:
read_lock(&nl_table_lock);
sk_for_each_bound(sk, &tbl->mc_list) {
if (sk_hashed(sk))
continue;
if (!net_eq(sock_net(sk), net))
continue;
if (num < s_num) {
num++;
continue;
}
if (sk_diag_fill(sk, skb, req,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
NLM_F_MULTI,
sock_i_ino(sk)) < 0) {
ret = 1;
break;
}
num++;
}
read_unlock(&nl_table_lock);
done:
cb->args[0] = num;
return ret;
}
static int netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
struct netlink_diag_req *req;
int s_num = cb->args[0];
int err = 0;
req = nlmsg_data(cb->nlh);
if (req->sdiag_protocol == NDIAG_PROTO_ALL) {
int i;
for (i = cb->args[1]; i < MAX_LINKS; i++) {
err = __netlink_diag_dump(skb, cb, i, s_num);
if (err)
break;
s_num = 0;
}
cb->args[1] = i;
} else {
netlink: netlink_diag_dump() runs without locks A recent commit removed locking from netlink_diag_dump() but forgot one error case. ===================================== [ BUG: bad unlock balance detected! ] 4.9.0-rc3+ #336 Not tainted ------------------------------------- syz-executor/4018 is trying to release lock ([ 36.220068] nl_table_lock ) at: [<ffffffff82dc8683>] netlink_diag_dump+0x1a3/0x250 net/netlink/diag.c:182 but there are no more locks to release! other info that might help us debug this: 3 locks held by syz-executor/4018: #0: [ 36.220068] ( sock_diag_mutex[ 36.220068] ){+.+.+.} , at: [ 36.220068] [<ffffffff82c3873b>] sock_diag_rcv+0x1b/0x40 #1: [ 36.220068] ( sock_diag_table_mutex[ 36.220068] ){+.+.+.} , at: [ 36.220068] [<ffffffff82c38e00>] sock_diag_rcv_msg+0x140/0x3a0 #2: [ 36.220068] ( nlk->cb_mutex[ 36.220068] ){+.+.+.} , at: [ 36.220068] [<ffffffff82db6600>] netlink_dump+0x50/0xac0 stack backtrace: CPU: 1 PID: 4018 Comm: syz-executor Not tainted 4.9.0-rc3+ #336 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 ffff8800645df688 ffffffff81b46934 ffffffff84eb3e78 ffff88006ad85800 ffffffff82dc8683 ffffffff84eb3e78 ffff8800645df6b8 ffffffff812043ca dffffc0000000000 ffff88006ad85ff8 ffff88006ad85fd0 00000000ffffffff Call Trace: [< inline >] __dump_stack lib/dump_stack.c:15 [<ffffffff81b46934>] dump_stack+0xb3/0x10f lib/dump_stack.c:51 [<ffffffff812043ca>] print_unlock_imbalance_bug+0x17a/0x1a0 kernel/locking/lockdep.c:3388 [< inline >] __lock_release kernel/locking/lockdep.c:3512 [<ffffffff8120cfd8>] lock_release+0x8e8/0xc60 kernel/locking/lockdep.c:3765 [< inline >] __raw_read_unlock ./include/linux/rwlock_api_smp.h:225 [<ffffffff83fc001a>] _raw_read_unlock+0x1a/0x30 kernel/locking/spinlock.c:255 [<ffffffff82dc8683>] netlink_diag_dump+0x1a3/0x250 net/netlink/diag.c:182 [<ffffffff82db6947>] netlink_dump+0x397/0xac0 net/netlink/af_netlink.c:2110 Fixes: ad202074320c ("netlink: Use rhashtable walk interface in diag dump") Signed-off-by: Eric Dumazet <edumazet@google.com> Reported-by: Andrey Konovalov <andreyknvl@google.com> Tested-by: Andrey Konovalov <andreyknvl@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-11-03 10:21:20 +07:00
if (req->sdiag_protocol >= MAX_LINKS)
return -ENOENT;
err = __netlink_diag_dump(skb, cb, req->sdiag_protocol, s_num);
}
return err < 0 ? err : skb->len;
}
static int netlink_diag_dump_done(struct netlink_callback *cb)
{
struct rhashtable_iter *hti = (void *)cb->args[2];
if (cb->args[0] == 1)
rhashtable_walk_exit(hti);
kfree(hti);
return 0;
}
static int netlink_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
{
int hdrlen = sizeof(struct netlink_diag_req);
struct net *net = sock_net(skb->sk);
if (nlmsg_len(h) < hdrlen)
return -EINVAL;
if (h->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.dump = netlink_diag_dump,
.done = netlink_diag_dump_done,
};
return netlink_dump_start(net->diag_nlsk, skb, h, &c);
} else
return -EOPNOTSUPP;
}
static const struct sock_diag_handler netlink_diag_handler = {
.family = AF_NETLINK,
.dump = netlink_diag_handler_dump,
};
static int __init netlink_diag_init(void)
{
return sock_diag_register(&netlink_diag_handler);
}
static void __exit netlink_diag_exit(void)
{
sock_diag_unregister(&netlink_diag_handler);
}
module_init(netlink_diag_init);
module_exit(netlink_diag_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 16 /* AF_NETLINK */);