From f623f75ae443d0c771635d51cc986b9d389bf631 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 14 Aug 2018 00:35:24 +0200 Subject: [PATCH 01/91] rfkill-gpio: include linux/mod_devicetable.h One more driver is apparently broken by the recent change to linux/platform_device.h: net/rfkill/rfkill-gpio.c: In function 'rfkill_gpio_acpi_probe': net/rfkill/rfkill-gpio.c:82:29: error: dereferencing pointer to incomplete type 'const struct acpi_device_id' Include linux/mod_devicetable.h to get the definition of the acpi_device_id structure. Fixes: ac3167257b9f ("headers: separate linux/mod_devicetable.h from linux/platform_device.h") Signed-off-by: Arnd Bergmann Signed-off-by: Johannes Berg --- net/rfkill/rfkill-gpio.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/rfkill/rfkill-gpio.c b/net/rfkill/rfkill-gpio.c index 00192a996be0..0f8465852254 100644 --- a/net/rfkill/rfkill-gpio.c +++ b/net/rfkill/rfkill-gpio.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include From 77cfaf52eca5cac30ed029507e0cab065f888995 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Mon, 13 Aug 2018 14:16:25 +0200 Subject: [PATCH 02/91] mac80211: Run TXQ teardown code before de-registering interfaces MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The TXQ teardown code can reference the vif data structures that are stored in the netdev private memory area if there are still packets on the queue when it is being freed. Since the TXQ teardown code is run after the netdevs are freed, this can lead to a use-after-free. Fix this by moving the TXQ teardown code to earlier in ieee80211_unregister_hw(). Reported-by: Ben Greear Tested-by: Ben Greear Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Johannes Berg --- net/mac80211/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mac80211/main.c b/net/mac80211/main.c index fb73451ed85e..0358f20b675f 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -1182,6 +1182,7 @@ void ieee80211_unregister_hw(struct ieee80211_hw *hw) #if IS_ENABLED(CONFIG_IPV6) unregister_inet6addr_notifier(&local->ifa6_notifier); #endif + ieee80211_txq_teardown_flows(local); rtnl_lock(); @@ -1210,7 +1211,6 @@ void ieee80211_unregister_hw(struct ieee80211_hw *hw) skb_queue_purge(&local->skb_queue); skb_queue_purge(&local->skb_queue_unreliable); skb_queue_purge(&local->skb_queue_tdls_chsw); - ieee80211_txq_teardown_flows(local); destroy_workqueue(local->workqueue); wiphy_unregister(local->hw.wiphy); From 484004339d4514fde425f6e8a9f6a6cc979bb0c3 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 15 Aug 2018 18:17:03 +0200 Subject: [PATCH 03/91] mac80211_hwsim: require at least one channel Syzbot continues to try to create mac80211_hwsim radios, and manages to pass parameters that are later checked with WARN_ON in cfg80211 - catch another one in hwsim directly. Reported-by: syzbot+2a12f11c306afe871c1f@syzkaller.appspotmail.com Signed-off-by: Johannes Berg --- drivers/net/wireless/mac80211_hwsim.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index 18e819d964f1..fe1b0108f06d 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -3194,6 +3194,11 @@ static int hwsim_new_radio_nl(struct sk_buff *msg, struct genl_info *info) if (info->attrs[HWSIM_ATTR_CHANNELS]) param.channels = nla_get_u32(info->attrs[HWSIM_ATTR_CHANNELS]); + if (param.channels < 1) { + GENL_SET_ERR_MSG(info, "must have at least one channel"); + return -EINVAL; + } + if (param.channels > CFG80211_MAX_NUM_DIFFERENT_CHANNELS) { GENL_SET_ERR_MSG(info, "too many channels specified"); return -EINVAL; From 8a54d8fc160e67ad485d95a0322ce1221f80770a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 18 Jun 2018 09:29:57 +0200 Subject: [PATCH 04/91] cfg80211: remove division by size of sizeof(struct ieee80211_wmm_rule) Pointer arithmetic already adjusts by the size of the struct, so the sizeof() calculation is wrong. This is basically the same as Colin King's patch for similar code in the iwlwifi driver. Fixes: 230ebaa189af ("cfg80211: read wmm rules from regulatory database") Signed-off-by: Johannes Berg --- net/wireless/reg.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 4fc66a117b7d..283902974fbf 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -452,8 +452,7 @@ reg_copy_regd(const struct ieee80211_regdomain *src_regd) continue; regd->reg_rules[i].wmm_rule = d_wmm + - (src_regd->reg_rules[i].wmm_rule - s_wmm) / - sizeof(struct ieee80211_wmm_rule); + (src_regd->reg_rules[i].wmm_rule - s_wmm); } return regd; } From bab1be79a5169ac748d8292b20c86d874022d7ba Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 27 Aug 2018 18:38:31 +0800 Subject: [PATCH 05/91] sctp: hold transport before accessing its asoc in sctp_transport_get_next As Marcelo noticed, in sctp_transport_get_next, it is iterating over transports but then also accessing the association directly, without checking any refcnts before that, which can cause an use-after-free Read. So fix it by holding transport before accessing the association. With that, sctp_transport_hold calls can be removed in the later places. Fixes: 626d16f50f39 ("sctp: export some apis or variables for sctp_diag and reuse some for proc") Reported-by: syzbot+fe62a0c9aa6a85c6de16@syzkaller.appspotmail.com Signed-off-by: Xin Long Acked-by: Neil Horman Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/proc.c | 4 ---- net/sctp/socket.c | 22 +++++++++++++++------- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/net/sctp/proc.c b/net/sctp/proc.c index ef5c9a82d4e8..4d6f1c8d6659 100644 --- a/net/sctp/proc.c +++ b/net/sctp/proc.c @@ -264,8 +264,6 @@ static int sctp_assocs_seq_show(struct seq_file *seq, void *v) } transport = (struct sctp_transport *)v; - if (!sctp_transport_hold(transport)) - return 0; assoc = transport->asoc; epb = &assoc->base; sk = epb->sk; @@ -322,8 +320,6 @@ static int sctp_remaddr_seq_show(struct seq_file *seq, void *v) } transport = (struct sctp_transport *)v; - if (!sctp_transport_hold(transport)) - return 0; assoc = transport->asoc; list_for_each_entry_rcu(tsp, &assoc->peer.transport_addr_list, diff --git a/net/sctp/socket.c b/net/sctp/socket.c index e96b15a66aba..aa76586a1a1c 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -5005,9 +5005,14 @@ struct sctp_transport *sctp_transport_get_next(struct net *net, break; } + if (!sctp_transport_hold(t)) + continue; + if (net_eq(sock_net(t->asoc->base.sk), net) && t->asoc->peer.primary_path == t) break; + + sctp_transport_put(t); } return t; @@ -5017,13 +5022,18 @@ struct sctp_transport *sctp_transport_get_idx(struct net *net, struct rhashtable_iter *iter, int pos) { - void *obj = SEQ_START_TOKEN; + struct sctp_transport *t; - while (pos && (obj = sctp_transport_get_next(net, iter)) && - !IS_ERR(obj)) - pos--; + if (!pos) + return SEQ_START_TOKEN; - return obj; + while ((t = sctp_transport_get_next(net, iter)) && !IS_ERR(t)) { + if (!--pos) + break; + sctp_transport_put(t); + } + + return t; } int sctp_for_each_endpoint(int (*cb)(struct sctp_endpoint *, void *), @@ -5082,8 +5092,6 @@ int sctp_for_each_transport(int (*cb)(struct sctp_transport *, void *), tsp = sctp_transport_get_idx(net, &hti, *pos + 1); for (; !IS_ERR_OR_NULL(tsp); tsp = sctp_transport_get_next(net, &hti)) { - if (!sctp_transport_hold(tsp)) - continue; ret = cb(tsp, p); if (ret) break; From 834539e69a5fe2aab33cc777ccfd4a4fcc5b9770 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 27 Aug 2018 18:40:18 +0800 Subject: [PATCH 06/91] sctp: remove useless start_fail from sctp_ht_iter in proc After changing rhashtable_walk_start to return void, start_fail would never be set other value than 0, and the checking for start_fail is pointless, so remove it. Fixes: 97a6ec4ac021 ("rhashtable: Change rhashtable_walk_start to return void") Signed-off-by: Xin Long Acked-by: Neil Horman Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/proc.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/net/sctp/proc.c b/net/sctp/proc.c index 4d6f1c8d6659..a644292f9faf 100644 --- a/net/sctp/proc.c +++ b/net/sctp/proc.c @@ -215,7 +215,6 @@ static const struct seq_operations sctp_eps_ops = { struct sctp_ht_iter { struct seq_net_private p; struct rhashtable_iter hti; - int start_fail; }; static void *sctp_transport_seq_start(struct seq_file *seq, loff_t *pos) @@ -224,7 +223,6 @@ static void *sctp_transport_seq_start(struct seq_file *seq, loff_t *pos) sctp_transport_walk_start(&iter->hti); - iter->start_fail = 0; return sctp_transport_get_idx(seq_file_net(seq), &iter->hti, *pos); } @@ -232,8 +230,6 @@ static void sctp_transport_seq_stop(struct seq_file *seq, void *v) { struct sctp_ht_iter *iter = seq->private; - if (iter->start_fail) - return; sctp_transport_walk_stop(&iter->hti); } From 84581bdae9587023cea1d139523f0ef0f28bd88d Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 27 Aug 2018 18:41:32 +0800 Subject: [PATCH 07/91] erspan: set erspan_ver to 1 by default when adding an erspan dev After erspan_ver is introudced, if erspan_ver is not set in iproute, its value will be left 0 by default. Since Commit 02f99df1875c ("erspan: fix invalid erspan version."), it has broken the traffic due to the version check in erspan_xmit if users are not aware of 'erspan_ver' param, like using an old version of iproute. To fix this compatibility problem, it sets erspan_ver to 1 by default when adding an erspan dev in erspan_setup. Note that we can't do it in ipgre_netlink_parms, as this function is also used by ipgre_changelink. Fixes: 02f99df1875c ("erspan: fix invalid erspan version.") Reported-by: Jianlin Shi Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 3 +++ net/ipv6/ip6_gre.c | 1 + 2 files changed, 4 insertions(+) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 51a5d06085ac..ae714aecc31c 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1508,11 +1508,14 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) static void erspan_setup(struct net_device *dev) { + struct ip_tunnel *t = netdev_priv(dev); + ether_setup(dev); dev->netdev_ops = &erspan_netdev_ops; dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; ip_tunnel_setup(dev, erspan_net_id); + t->erspan_ver = 1; } static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = { diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 18a3794b0f52..e493b041d4ac 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1778,6 +1778,7 @@ static void ip6gre_netlink_parms(struct nlattr *data[], if (data[IFLA_GRE_COLLECT_METADATA]) parms->collect_md = true; + parms->erspan_ver = 1; if (data[IFLA_GRE_ERSPAN_VER]) parms->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]); From d5ed72a55bc0a321ef33d272e2b0bf0d2b06d1fe Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 27 Aug 2018 20:58:43 +0200 Subject: [PATCH 08/91] net: sched: fix extack error message when chain is failed to be created Instead "Cannot find" say "Cannot create". Fixes: c35a4acc2985 ("net: sched: cls_api: handle generic cls errors") Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 31bd1439cf60..2d41c5b21b48 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1252,7 +1252,7 @@ static int tc_new_tfilter(struct sk_buff *skb, struct nlmsghdr *n, } chain = tcf_chain_get(block, chain_index, true); if (!chain) { - NL_SET_ERR_MSG(extack, "Cannot find specified filter chain"); + NL_SET_ERR_MSG(extack, "Cannot create specified filter chain"); err = -ENOMEM; goto errout; } From b7b4247d553939ccf02ff597ec60f41a2f93ee8e Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 27 Aug 2018 20:58:44 +0200 Subject: [PATCH 09/91] net: sched: return -ENOENT when trying to remove filter from non-existent chain When chain 0 was implicitly created, removal of non-existent filter from chain 0 gave -ENOENT. Once chain 0 became non-implicit, the same call is giving -EINVAL. Fix this by returning -ENOENT in that case. Reported-by: Roman Mashak Fixes: f71e0ca4db18 ("net: sched: Avoid implicit chain 0 creation") Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 2d41c5b21b48..1a67af8a6e8c 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1399,7 +1399,7 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n, goto errout; } NL_SET_ERR_MSG(extack, "Cannot find specified filter chain"); - err = -EINVAL; + err = -ENOENT; goto errout; } From 30935198b7d0be12b1c45c328b66a7fdefb16256 Mon Sep 17 00:00:00 2001 From: Haiqing Bai Date: Mon, 27 Aug 2018 09:32:26 +0800 Subject: [PATCH 10/91] tipc: fix the big/little endian issue in tipc_dest In function tipc_dest_push, the 32bit variables 'node' and 'port' are stored separately in uppper and lower part of 64bit 'value'. Then this value is assigned to dst->value which is a union like: union { struct { u32 port; u32 node; }; u64 value; } This works on little-endian machines like x86 but fails on big-endian machines. The fix remove the 'value' stack parameter and even the 'value' member of the union in tipc_dest, assign the 'node' and 'port' member directly with the input parameter to avoid the endian issue. Fixes: a80ae5306a73 ("tipc: improve destination linked list") Signed-off-by: Zhenbo Gao Acked-by: Jon Maloy Signed-off-by: Haiqing Bai Signed-off-by: David S. Miller --- net/tipc/name_table.c | 10 ++++------ net/tipc/name_table.h | 9 ++------- 2 files changed, 6 insertions(+), 13 deletions(-) diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index 88f027b502f6..66d5b2c5987a 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -980,20 +980,17 @@ int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb) struct tipc_dest *tipc_dest_find(struct list_head *l, u32 node, u32 port) { - u64 value = (u64)node << 32 | port; struct tipc_dest *dst; list_for_each_entry(dst, l, list) { - if (dst->value != value) - continue; - return dst; + if (dst->node == node && dst->port == port) + return dst; } return NULL; } bool tipc_dest_push(struct list_head *l, u32 node, u32 port) { - u64 value = (u64)node << 32 | port; struct tipc_dest *dst; if (tipc_dest_find(l, node, port)) @@ -1002,7 +999,8 @@ bool tipc_dest_push(struct list_head *l, u32 node, u32 port) dst = kmalloc(sizeof(*dst), GFP_ATOMIC); if (unlikely(!dst)) return false; - dst->value = value; + dst->node = node; + dst->port = port; list_add(&dst->list, l); return true; } diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h index 0febba41da86..892bd750b85f 100644 --- a/net/tipc/name_table.h +++ b/net/tipc/name_table.h @@ -133,13 +133,8 @@ void tipc_nametbl_stop(struct net *net); struct tipc_dest { struct list_head list; - union { - struct { - u32 port; - u32 node; - }; - u64 value; - }; + u32 port; + u32 node; }; struct tipc_dest *tipc_dest_find(struct list_head *l, u32 node, u32 port); From ad8619864f0c9bd89e14d957afa3fd8aaf0720da Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Mon, 27 Aug 2018 00:20:11 +0200 Subject: [PATCH 11/91] net: dsa: Drop GPIO includes Commit 52638f71fcff ("dsa: Move gpio reset into switch driver") moved the GPIO handling into the switch drivers but forgot to remove the GPIO header includes. Signed-off-by: Linus Walleij Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- net/dsa/dsa.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index e63c554e0623..9f3209ff7ffd 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -19,12 +19,10 @@ #include #include #include -#include #include #include #include #include -#include #include #include "dsa_priv.h" From 53ae914d898e5dd5984d352d5fa0b23410f966a0 Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Sat, 25 Aug 2018 15:19:05 +0800 Subject: [PATCH 12/91] net/rds: Use rdma_read_gids to get connection SGID/DGID in IPv6 In IPv4, the newly introduced rdma_read_gids is used to read the SGID/DGID for the connection which returns GID correctly for RoCE transport as well. In IPv6, rdma_read_gids is also used. The following are why rdma_read_gids is introduced. rdma_addr_get_dgid() for RoCE for client side connections returns MAC address, instead of DGID. rdma_addr_get_sgid() for RoCE doesn't return correct SGID for IPv6 and when more than one IP address is assigned to the netdevice. So the transport agnostic rdma_read_gids() API is provided by rdma_cm module. Signed-off-by: Zhu Yanjun Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/ib.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/net/rds/ib.c b/net/rds/ib.c index c1d97640c0be..eba75c1ba359 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -341,15 +341,10 @@ static int rds6_ib_conn_info_visitor(struct rds_connection *conn, if (rds_conn_state(conn) == RDS_CONN_UP) { struct rds_ib_device *rds_ibdev; - struct rdma_dev_addr *dev_addr; ic = conn->c_transport_data; - dev_addr = &ic->i_cm_id->route.addr.dev_addr; - rdma_addr_get_sgid(dev_addr, - (union ib_gid *)&iinfo6->src_gid); - rdma_addr_get_dgid(dev_addr, - (union ib_gid *)&iinfo6->dst_gid); - + rdma_read_gids(ic->i_cm_id, (union ib_gid *)&iinfo6->src_gid, + (union ib_gid *)&iinfo6->dst_gid); rds_ibdev = ic->rds_ibdev; iinfo6->max_send_wr = ic->i_send_ring.w_nr; iinfo6->max_recv_wr = ic->i_recv_ring.w_nr; From e06fa9c16ce4b740996189fa5610eabcee734e6c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 24 Aug 2018 22:08:50 +0200 Subject: [PATCH 13/91] bpf, sockmap: fix potential use after free in bpf_tcp_close bpf_tcp_close() we pop the psock linkage to a map via psock_map_pop(). A parallel update on the sock hash map can happen between psock_map_pop() and lookup_elem_raw() where we override the element under link->hash / link->key. In bpf_tcp_close()'s lookup_elem_raw() we subsequently only test whether an element is present, but we do not test whether the element is infact the element we were looking for. We lock the sock in bpf_tcp_close() during that time, so do we hold the lock in sock_hash_update_elem(). However, the latter locks the sock which is newly updated, not the one we're purging from the hash table. This means that while one CPU is doing the lookup from bpf_tcp_close(), another CPU is doing the map update in parallel, dropped our sock from the hlist and released the psock. Subsequently the first CPU will find the new sock and attempts to drop and release the old sock yet another time. Fix is that we need to check the elements for a match after lookup, similar as we do in the sock map. Note that the hash tab elems are freed via RCU, so access to their link->hash / link->key is fine since we're under RCU read side there. Fixes: e9db4ef6bf4c ("bpf: sockhash fix omitted bucket lock in sock_close") Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Signed-off-by: Alexei Starovoitov --- kernel/bpf/sockmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index cf5195c7c331..01879e4d599a 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -369,7 +369,7 @@ static void bpf_tcp_close(struct sock *sk, long timeout) /* If another thread deleted this object skip deletion. * The refcnt on psock may or may not be zero. */ - if (l) { + if (l && l == link) { hlist_del_rcu(&link->hash_node); smap_release_sock(psock, link->sk); free_htab_elem(htab, link); From 15c480efab01197c965ce0562a43ffedd852b8f9 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 24 Aug 2018 22:08:51 +0200 Subject: [PATCH 14/91] bpf, sockmap: fix psock refcount leak in bpf_tcp_recvmsg In bpf_tcp_recvmsg() we first took a reference on the psock, however once we find that there are skbs in the normal socket's receive queue we return with processing them through tcp_recvmsg(). Problem is that we leak the taken reference on the psock in that path. Given we don't really do anything with the psock at this point, move the skb_queue_empty() test before we fetch the psock to fix this case. Fixes: 8934ce2fd081 ("bpf: sockmap redirect ingress support") Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Signed-off-by: Alexei Starovoitov --- kernel/bpf/sockmap.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 01879e4d599a..26d8a3053407 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -912,6 +912,8 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, if (unlikely(flags & MSG_ERRQUEUE)) return inet_recv_error(sk, msg, len, addr_len); + if (!skb_queue_empty(&sk->sk_receive_queue)) + return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); rcu_read_lock(); psock = smap_psock_sk(sk); @@ -922,9 +924,6 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, goto out; rcu_read_unlock(); - if (!skb_queue_empty(&sk->sk_receive_queue)) - return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); - lock_sock(sk); bytes_ready: while (copied != len) { From 3f6e138d41ddff196f452993528cfe75762ede0f Mon Sep 17 00:00:00 2001 From: Stefan Agner Date: Mon, 27 Aug 2018 21:30:42 +0200 Subject: [PATCH 15/91] bpf: fix build error with clang Building the newly introduced BPF_PROG_TYPE_SK_REUSEPORT leads to a compile time error when building with clang: net/core/filter.o: In function `sk_reuseport_convert_ctx_access': ../net/core/filter.c:7284: undefined reference to `__compiletime_assert_7284' It seems that clang has issues resolving hweight_long at compile time. Since SK_FL_PROTO_MASK is a constant, we can use the interface for known constant arguments which works fine with clang. Fixes: 2dbb9b9e6df6 ("bpf: Introduce BPF_PROG_TYPE_SK_REUSEPORT") Signed-off-by: Stefan Agner Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index c25eb36f1320..7a2430945c71 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7281,7 +7281,7 @@ static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct sk_reuseport_md, ip_protocol): - BUILD_BUG_ON(hweight_long(SK_FL_PROTO_MASK) != BITS_PER_BYTE); + BUILD_BUG_ON(HWEIGHT32(SK_FL_PROTO_MASK) != BITS_PER_BYTE); SK_REUSEPORT_LOAD_SK_FIELD_SIZE_OFF(__sk_flags_offset, BPF_W, 0); *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); From 501ca81760c204ec59b73e4a00bee5971fc0f1b1 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 24 Aug 2018 17:37:00 -0700 Subject: [PATCH 16/91] bpf: sockmap, decrement copied count correctly in redirect error case Currently, when a redirect occurs in sockmap and an error occurs in the redirect call we unwind the scatterlist once in the error path of bpf_tcp_sendmsg_do_redirect() and then again in sendmsg(). Then in the error path of sendmsg we decrement the copied count by the send size. However, its possible we partially sent data before the error was generated. This can happen if do_tcp_sendpages() partially sends the scatterlist before encountering a memory pressure error. If this happens we need to decrement the copied value (the value tracking how many bytes were actually sent to TCP stack) by the number of remaining bytes _not_ the entire send size. Otherwise we risk confusing userspace. Also we don't need two calls to free the scatterlist one is good enough. So remove the one in bpf_tcp_sendmsg_do_redirect() and then properly reduce copied by the number of remaining bytes which may in fact be the entire send size if no bytes were sent. To do this use bool to indicate if free_start_sg() should do mem accounting or not. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 45 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 26d8a3053407..ce63e5801746 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -236,7 +236,7 @@ static int bpf_tcp_init(struct sock *sk) } static void smap_release_sock(struct smap_psock *psock, struct sock *sock); -static int free_start_sg(struct sock *sk, struct sk_msg_buff *md); +static int free_start_sg(struct sock *sk, struct sk_msg_buff *md, bool charge); static void bpf_tcp_release(struct sock *sk) { @@ -248,7 +248,7 @@ static void bpf_tcp_release(struct sock *sk) goto out; if (psock->cork) { - free_start_sg(psock->sock, psock->cork); + free_start_sg(psock->sock, psock->cork, true); kfree(psock->cork); psock->cork = NULL; } @@ -330,14 +330,14 @@ static void bpf_tcp_close(struct sock *sk, long timeout) close_fun = psock->save_close; if (psock->cork) { - free_start_sg(psock->sock, psock->cork); + free_start_sg(psock->sock, psock->cork, true); kfree(psock->cork); psock->cork = NULL; } list_for_each_entry_safe(md, mtmp, &psock->ingress, list) { list_del(&md->list); - free_start_sg(psock->sock, md); + free_start_sg(psock->sock, md, true); kfree(md); } @@ -570,14 +570,16 @@ static void free_bytes_sg(struct sock *sk, int bytes, md->sg_start = i; } -static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md) +static int free_sg(struct sock *sk, int start, + struct sk_msg_buff *md, bool charge) { struct scatterlist *sg = md->sg_data; int i = start, free = 0; while (sg[i].length) { free += sg[i].length; - sk_mem_uncharge(sk, sg[i].length); + if (charge) + sk_mem_uncharge(sk, sg[i].length); if (!md->skb) put_page(sg_page(&sg[i])); sg[i].length = 0; @@ -594,9 +596,9 @@ static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md) return free; } -static int free_start_sg(struct sock *sk, struct sk_msg_buff *md) +static int free_start_sg(struct sock *sk, struct sk_msg_buff *md, bool charge) { - int free = free_sg(sk, md->sg_start, md); + int free = free_sg(sk, md->sg_start, md, charge); md->sg_start = md->sg_end; return free; @@ -604,7 +606,7 @@ static int free_start_sg(struct sock *sk, struct sk_msg_buff *md) static int free_curr_sg(struct sock *sk, struct sk_msg_buff *md) { - return free_sg(sk, md->sg_curr, md); + return free_sg(sk, md->sg_curr, md, true); } static int bpf_map_msg_verdict(int _rc, struct sk_msg_buff *md) @@ -718,7 +720,7 @@ static int bpf_tcp_ingress(struct sock *sk, int apply_bytes, list_add_tail(&r->list, &psock->ingress); sk->sk_data_ready(sk); } else { - free_start_sg(sk, r); + free_start_sg(sk, r, true); kfree(r); } @@ -752,14 +754,10 @@ static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send, release_sock(sk); } smap_release_sock(psock, sk); - if (unlikely(err)) - goto out; - return 0; + return err; out_rcu: rcu_read_unlock(); -out: - free_bytes_sg(NULL, send, md, false); - return err; + return 0; } static inline void bpf_md_init(struct smap_psock *psock) @@ -822,7 +820,7 @@ static int bpf_exec_tx_verdict(struct smap_psock *psock, case __SK_PASS: err = bpf_tcp_push(sk, send, m, flags, true); if (unlikely(err)) { - *copied -= free_start_sg(sk, m); + *copied -= free_start_sg(sk, m, true); break; } @@ -845,16 +843,17 @@ static int bpf_exec_tx_verdict(struct smap_psock *psock, lock_sock(sk); if (unlikely(err < 0)) { - free_start_sg(sk, m); + int free = free_start_sg(sk, m, false); + psock->sg_size = 0; if (!cork) - *copied -= send; + *copied -= free; } else { psock->sg_size -= send; } if (cork) { - free_start_sg(sk, m); + free_start_sg(sk, m, true); psock->sg_size = 0; kfree(m); m = NULL; @@ -1121,7 +1120,7 @@ static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) err = sk_stream_wait_memory(sk, &timeo); if (err) { if (m && m != psock->cork) - free_start_sg(sk, m); + free_start_sg(sk, m, true); goto out_err; } } @@ -1580,13 +1579,13 @@ static void smap_gc_work(struct work_struct *w) bpf_prog_put(psock->bpf_tx_msg); if (psock->cork) { - free_start_sg(psock->sock, psock->cork); + free_start_sg(psock->sock, psock->cork, true); kfree(psock->cork); } list_for_each_entry_safe(md, mtmp, &psock->ingress, list) { list_del(&md->list); - free_start_sg(psock->sock, md); + free_start_sg(psock->sock, md, true); kfree(md); } From 67d1ba8a6dc83d90cd58b89fa6cbf9ae35a0cf7f Mon Sep 17 00:00:00 2001 From: Danek Duvall Date: Wed, 22 Aug 2018 16:01:04 -0700 Subject: [PATCH 17/91] mac80211: correct use of IEEE80211_VHT_CAP_RXSTBC_X The mod mask for VHT capabilities intends to say that you can override the number of STBC receive streams, and it does, but only by accident. The IEEE80211_VHT_CAP_RXSTBC_X aren't bits to be set, but values (albeit left-shifted). ORing the bits together gets the right answer, but we should use the _MASK macro here instead. Signed-off-by: Danek Duvall Signed-off-by: Johannes Berg --- net/mac80211/main.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 0358f20b675f..27cd64acaf00 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -470,10 +470,7 @@ static const struct ieee80211_vht_cap mac80211_vht_capa_mod_mask = { cpu_to_le32(IEEE80211_VHT_CAP_RXLDPC | IEEE80211_VHT_CAP_SHORT_GI_80 | IEEE80211_VHT_CAP_SHORT_GI_160 | - IEEE80211_VHT_CAP_RXSTBC_1 | - IEEE80211_VHT_CAP_RXSTBC_2 | - IEEE80211_VHT_CAP_RXSTBC_3 | - IEEE80211_VHT_CAP_RXSTBC_4 | + IEEE80211_VHT_CAP_RXSTBC_MASK | IEEE80211_VHT_CAP_TXSTBC | IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE | IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE | From d7c863a2f65e48f442379f4ee1846d52e0c5d24d Mon Sep 17 00:00:00 2001 From: Danek Duvall Date: Wed, 22 Aug 2018 16:01:05 -0700 Subject: [PATCH 18/91] mac80211_hwsim: correct use of IEEE80211_VHT_CAP_RXSTBC_X The mac80211_hwsim driver intends to say that it supports up to four STBC receive streams, but instead it ends up saying something undefined. The IEEE80211_VHT_CAP_RXSTBC_X macros aren't independent bits that can be ORed together, but values. In this case, _4 is the appropriate one to use. Signed-off-by: Danek Duvall Signed-off-by: Johannes Berg --- drivers/net/wireless/mac80211_hwsim.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index fe1b0108f06d..7d0b460868f9 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -2699,9 +2699,6 @@ static int mac80211_hwsim_new_radio(struct genl_info *info, IEEE80211_VHT_CAP_SHORT_GI_80 | IEEE80211_VHT_CAP_SHORT_GI_160 | IEEE80211_VHT_CAP_TXSTBC | - IEEE80211_VHT_CAP_RXSTBC_1 | - IEEE80211_VHT_CAP_RXSTBC_2 | - IEEE80211_VHT_CAP_RXSTBC_3 | IEEE80211_VHT_CAP_RXSTBC_4 | IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK; sband->vht_cap.vht_mcs.rx_mcs_map = From 38cb87ee47fb825f6c9d645c019f75b3905c0ab2 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Wed, 22 Aug 2018 13:52:21 +0200 Subject: [PATCH 19/91] cfg80211: make wmm_rule part of the reg_rule structure Make wmm_rule be part of the reg_rule structure. This simplifies the code a lot at the cost of having bigger memory usage. However in most cases we have only few reg_rule's and when we do have many like in iwlwifi we do not save memory as it allocates a separate wmm_rule for each channel anyway. This also fixes a bug reported in various places where somewhere the pointers were corrupted and we ended up doing a null-dereference. Fixes: 230ebaa189af ("cfg80211: read wmm rules from regulatory database") Signed-off-by: Stanislaw Gruszka [rephrase commit message slightly] Signed-off-by: Johannes Berg --- .../wireless/intel/iwlwifi/iwl-nvm-parse.c | 50 ++--------- include/net/cfg80211.h | 4 +- include/net/regulatory.h | 4 +- net/mac80211/util.c | 8 +- net/wireless/nl80211.c | 10 +-- net/wireless/reg.c | 90 +++---------------- 6 files changed, 31 insertions(+), 135 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c index b815ba38dbdb..88121548eb9f 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c +++ b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c @@ -877,15 +877,12 @@ iwl_parse_nvm_mcc_info(struct device *dev, const struct iwl_cfg *cfg, const u8 *nvm_chan = cfg->nvm_type == IWL_NVM_EXT ? iwl_ext_nvm_channels : iwl_nvm_channels; struct ieee80211_regdomain *regd, *copy_rd; - int size_of_regd, regd_to_copy, wmms_to_copy; - int size_of_wmms = 0; + int size_of_regd, regd_to_copy; struct ieee80211_reg_rule *rule; - struct ieee80211_wmm_rule *wmm_rule, *d_wmm, *s_wmm; struct regdb_ptrs *regdb_ptrs; enum nl80211_band band; int center_freq, prev_center_freq = 0; - int valid_rules = 0, n_wmms = 0; - int i; + int valid_rules = 0; bool new_rule; int max_num_ch = cfg->nvm_type == IWL_NVM_EXT ? IWL_NVM_NUM_CHANNELS_EXT : IWL_NVM_NUM_CHANNELS; @@ -904,11 +901,7 @@ iwl_parse_nvm_mcc_info(struct device *dev, const struct iwl_cfg *cfg, sizeof(struct ieee80211_regdomain) + num_of_ch * sizeof(struct ieee80211_reg_rule); - if (geo_info & GEO_WMM_ETSI_5GHZ_INFO) - size_of_wmms = - num_of_ch * sizeof(struct ieee80211_wmm_rule); - - regd = kzalloc(size_of_regd + size_of_wmms, GFP_KERNEL); + regd = kzalloc(size_of_regd, GFP_KERNEL); if (!regd) return ERR_PTR(-ENOMEM); @@ -922,8 +915,6 @@ iwl_parse_nvm_mcc_info(struct device *dev, const struct iwl_cfg *cfg, regd->alpha2[0] = fw_mcc >> 8; regd->alpha2[1] = fw_mcc & 0xff; - wmm_rule = (struct ieee80211_wmm_rule *)((u8 *)regd + size_of_regd); - for (ch_idx = 0; ch_idx < num_of_ch; ch_idx++) { ch_flags = (u16)__le32_to_cpup(channels + ch_idx); band = (ch_idx < NUM_2GHZ_CHANNELS) ? @@ -977,26 +968,10 @@ iwl_parse_nvm_mcc_info(struct device *dev, const struct iwl_cfg *cfg, band == NL80211_BAND_2GHZ) continue; - if (!reg_query_regdb_wmm(regd->alpha2, center_freq, - ®db_ptrs[n_wmms].token, wmm_rule)) { - /* Add only new rules */ - for (i = 0; i < n_wmms; i++) { - if (regdb_ptrs[i].token == - regdb_ptrs[n_wmms].token) { - rule->wmm_rule = regdb_ptrs[i].rule; - break; - } - } - if (i == n_wmms) { - rule->wmm_rule = wmm_rule; - regdb_ptrs[n_wmms++].rule = wmm_rule; - wmm_rule++; - } - } + reg_query_regdb_wmm(regd->alpha2, center_freq, rule); } regd->n_reg_rules = valid_rules; - regd->n_wmm_rules = n_wmms; /* * Narrow down regdom for unused regulatory rules to prevent hole @@ -1005,28 +980,13 @@ iwl_parse_nvm_mcc_info(struct device *dev, const struct iwl_cfg *cfg, regd_to_copy = sizeof(struct ieee80211_regdomain) + valid_rules * sizeof(struct ieee80211_reg_rule); - wmms_to_copy = sizeof(struct ieee80211_wmm_rule) * n_wmms; - - copy_rd = kzalloc(regd_to_copy + wmms_to_copy, GFP_KERNEL); + copy_rd = kzalloc(regd_to_copy, GFP_KERNEL); if (!copy_rd) { copy_rd = ERR_PTR(-ENOMEM); goto out; } memcpy(copy_rd, regd, regd_to_copy); - memcpy((u8 *)copy_rd + regd_to_copy, (u8 *)regd + size_of_regd, - wmms_to_copy); - - d_wmm = (struct ieee80211_wmm_rule *)((u8 *)copy_rd + regd_to_copy); - s_wmm = (struct ieee80211_wmm_rule *)((u8 *)regd + size_of_regd); - - for (i = 0; i < regd->n_reg_rules; i++) { - if (!regd->reg_rules[i].wmm_rule) - continue; - - copy_rd->reg_rules[i].wmm_rule = d_wmm + - (regd->reg_rules[i].wmm_rule - s_wmm); - } out: kfree(regdb_ptrs); diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 1beb3ead0385..7229c186d199 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -4763,8 +4763,8 @@ const char *reg_initiator_name(enum nl80211_reg_initiator initiator); * * Return: 0 on success. -ENODATA. */ -int reg_query_regdb_wmm(char *alpha2, int freq, u32 *ptr, - struct ieee80211_wmm_rule *rule); +int reg_query_regdb_wmm(char *alpha2, int freq, + struct ieee80211_reg_rule *rule); /* * callbacks for asynchronous cfg80211 methods, notification diff --git a/include/net/regulatory.h b/include/net/regulatory.h index 60f8cc86a447..3469750df0f4 100644 --- a/include/net/regulatory.h +++ b/include/net/regulatory.h @@ -217,15 +217,15 @@ struct ieee80211_wmm_rule { struct ieee80211_reg_rule { struct ieee80211_freq_range freq_range; struct ieee80211_power_rule power_rule; - struct ieee80211_wmm_rule *wmm_rule; + struct ieee80211_wmm_rule wmm_rule; u32 flags; u32 dfs_cac_ms; + bool has_wmm; }; struct ieee80211_regdomain { struct rcu_head rcu_head; u32 n_reg_rules; - u32 n_wmm_rules; char alpha2[3]; enum nl80211_dfs_regions dfs_region; struct ieee80211_reg_rule reg_rules[]; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index d02fbfec3783..c80187d6e6bb 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1120,7 +1120,7 @@ void ieee80211_regulatory_limit_wmm_params(struct ieee80211_sub_if_data *sdata, { struct ieee80211_chanctx_conf *chanctx_conf; const struct ieee80211_reg_rule *rrule; - struct ieee80211_wmm_ac *wmm_ac; + const struct ieee80211_wmm_ac *wmm_ac; u16 center_freq = 0; if (sdata->vif.type != NL80211_IFTYPE_AP && @@ -1139,15 +1139,15 @@ void ieee80211_regulatory_limit_wmm_params(struct ieee80211_sub_if_data *sdata, rrule = freq_reg_info(sdata->wdev.wiphy, MHZ_TO_KHZ(center_freq)); - if (IS_ERR_OR_NULL(rrule) || !rrule->wmm_rule) { + if (IS_ERR_OR_NULL(rrule) || !rrule->has_wmm) { rcu_read_unlock(); return; } if (sdata->vif.type == NL80211_IFTYPE_AP) - wmm_ac = &rrule->wmm_rule->ap[ac]; + wmm_ac = &rrule->wmm_rule.ap[ac]; else - wmm_ac = &rrule->wmm_rule->client[ac]; + wmm_ac = &rrule->wmm_rule.client[ac]; qparam->cw_min = max_t(u16, qparam->cw_min, wmm_ac->cw_min); qparam->cw_max = max_t(u16, qparam->cw_max, wmm_ac->cw_max); qparam->aifs = max_t(u8, qparam->aifs, wmm_ac->aifsn); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 80bc986c79e5..e3dcffd96919 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -667,13 +667,13 @@ static int nl80211_msg_put_wmm_rules(struct sk_buff *msg, goto nla_put_failure; if (nla_put_u16(msg, NL80211_WMMR_CW_MIN, - rule->wmm_rule->client[j].cw_min) || + rule->wmm_rule.client[j].cw_min) || nla_put_u16(msg, NL80211_WMMR_CW_MAX, - rule->wmm_rule->client[j].cw_max) || + rule->wmm_rule.client[j].cw_max) || nla_put_u8(msg, NL80211_WMMR_AIFSN, - rule->wmm_rule->client[j].aifsn) || + rule->wmm_rule.client[j].aifsn) || nla_put_u8(msg, NL80211_WMMR_TXOP, - rule->wmm_rule->client[j].cot)) + rule->wmm_rule.client[j].cot)) goto nla_put_failure; nla_nest_end(msg, nl_wmm_rule); @@ -766,7 +766,7 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy, const struct ieee80211_reg_rule *rule = freq_reg_info(wiphy, chan->center_freq); - if (!IS_ERR(rule) && rule->wmm_rule) { + if (!IS_ERR_OR_NULL(rule) && rule->has_wmm) { if (nl80211_msg_put_wmm_rules(msg, rule)) goto nla_put_failure; } diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 283902974fbf..2f702adf2912 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -425,35 +425,23 @@ static const struct ieee80211_regdomain * reg_copy_regd(const struct ieee80211_regdomain *src_regd) { struct ieee80211_regdomain *regd; - int size_of_regd, size_of_wmms; + int size_of_regd; unsigned int i; - struct ieee80211_wmm_rule *d_wmm, *s_wmm; size_of_regd = sizeof(struct ieee80211_regdomain) + src_regd->n_reg_rules * sizeof(struct ieee80211_reg_rule); - size_of_wmms = src_regd->n_wmm_rules * - sizeof(struct ieee80211_wmm_rule); - regd = kzalloc(size_of_regd + size_of_wmms, GFP_KERNEL); + regd = kzalloc(size_of_regd, GFP_KERNEL); if (!regd) return ERR_PTR(-ENOMEM); memcpy(regd, src_regd, sizeof(struct ieee80211_regdomain)); - d_wmm = (struct ieee80211_wmm_rule *)((u8 *)regd + size_of_regd); - s_wmm = (struct ieee80211_wmm_rule *)((u8 *)src_regd + size_of_regd); - memcpy(d_wmm, s_wmm, size_of_wmms); - - for (i = 0; i < src_regd->n_reg_rules; i++) { + for (i = 0; i < src_regd->n_reg_rules; i++) memcpy(®d->reg_rules[i], &src_regd->reg_rules[i], sizeof(struct ieee80211_reg_rule)); - if (!src_regd->reg_rules[i].wmm_rule) - continue; - regd->reg_rules[i].wmm_rule = d_wmm + - (src_regd->reg_rules[i].wmm_rule - s_wmm); - } return regd; } @@ -859,9 +847,10 @@ static bool valid_regdb(const u8 *data, unsigned int size) return true; } -static void set_wmm_rule(struct ieee80211_wmm_rule *rule, +static void set_wmm_rule(struct ieee80211_reg_rule *rrule, struct fwdb_wmm_rule *wmm) { + struct ieee80211_wmm_rule *rule = &rrule->wmm_rule; unsigned int i; for (i = 0; i < IEEE80211_NUM_ACS; i++) { @@ -875,11 +864,13 @@ static void set_wmm_rule(struct ieee80211_wmm_rule *rule, rule->ap[i].aifsn = wmm->ap[i].aifsn; rule->ap[i].cot = 1000 * be16_to_cpu(wmm->ap[i].cot); } + + rrule->has_wmm = true; } static int __regdb_query_wmm(const struct fwdb_header *db, const struct fwdb_country *country, int freq, - u32 *dbptr, struct ieee80211_wmm_rule *rule) + struct ieee80211_reg_rule *rule) { unsigned int ptr = be16_to_cpu(country->coll_ptr) << 2; struct fwdb_collection *coll = (void *)((u8 *)db + ptr); @@ -900,8 +891,6 @@ static int __regdb_query_wmm(const struct fwdb_header *db, wmm_ptr = be16_to_cpu(rrule->wmm_ptr) << 2; wmm = (void *)((u8 *)db + wmm_ptr); set_wmm_rule(rule, wmm); - if (dbptr) - *dbptr = wmm_ptr; return 0; } } @@ -909,8 +898,7 @@ static int __regdb_query_wmm(const struct fwdb_header *db, return -ENODATA; } -int reg_query_regdb_wmm(char *alpha2, int freq, u32 *dbptr, - struct ieee80211_wmm_rule *rule) +int reg_query_regdb_wmm(char *alpha2, int freq, struct ieee80211_reg_rule *rule) { const struct fwdb_header *hdr = regdb; const struct fwdb_country *country; @@ -924,8 +912,7 @@ int reg_query_regdb_wmm(char *alpha2, int freq, u32 *dbptr, country = &hdr->country[0]; while (country->coll_ptr) { if (alpha2_equal(alpha2, country->alpha2)) - return __regdb_query_wmm(regdb, country, freq, dbptr, - rule); + return __regdb_query_wmm(regdb, country, freq, rule); country++; } @@ -934,32 +921,13 @@ int reg_query_regdb_wmm(char *alpha2, int freq, u32 *dbptr, } EXPORT_SYMBOL(reg_query_regdb_wmm); -struct wmm_ptrs { - struct ieee80211_wmm_rule *rule; - u32 ptr; -}; - -static struct ieee80211_wmm_rule *find_wmm_ptr(struct wmm_ptrs *wmm_ptrs, - u32 wmm_ptr, int n_wmms) -{ - int i; - - for (i = 0; i < n_wmms; i++) { - if (wmm_ptrs[i].ptr == wmm_ptr) - return wmm_ptrs[i].rule; - } - return NULL; -} - static int regdb_query_country(const struct fwdb_header *db, const struct fwdb_country *country) { unsigned int ptr = be16_to_cpu(country->coll_ptr) << 2; struct fwdb_collection *coll = (void *)((u8 *)db + ptr); struct ieee80211_regdomain *regdom; - struct ieee80211_regdomain *tmp_rd; - unsigned int size_of_regd, i, n_wmms = 0; - struct wmm_ptrs *wmm_ptrs; + unsigned int size_of_regd, i; size_of_regd = sizeof(struct ieee80211_regdomain) + coll->n_rules * sizeof(struct ieee80211_reg_rule); @@ -968,12 +936,6 @@ static int regdb_query_country(const struct fwdb_header *db, if (!regdom) return -ENOMEM; - wmm_ptrs = kcalloc(coll->n_rules, sizeof(*wmm_ptrs), GFP_KERNEL); - if (!wmm_ptrs) { - kfree(regdom); - return -ENOMEM; - } - regdom->n_reg_rules = coll->n_rules; regdom->alpha2[0] = country->alpha2[0]; regdom->alpha2[1] = country->alpha2[1]; @@ -1012,37 +974,11 @@ static int regdb_query_country(const struct fwdb_header *db, 1000 * be16_to_cpu(rule->cac_timeout); if (rule->len >= offsetofend(struct fwdb_rule, wmm_ptr)) { u32 wmm_ptr = be16_to_cpu(rule->wmm_ptr) << 2; - struct ieee80211_wmm_rule *wmm_pos = - find_wmm_ptr(wmm_ptrs, wmm_ptr, n_wmms); - struct fwdb_wmm_rule *wmm; - struct ieee80211_wmm_rule *wmm_rule; + struct fwdb_wmm_rule *wmm = (void *)((u8 *)db + wmm_ptr); - if (wmm_pos) { - rrule->wmm_rule = wmm_pos; - continue; - } - wmm = (void *)((u8 *)db + wmm_ptr); - tmp_rd = krealloc(regdom, size_of_regd + (n_wmms + 1) * - sizeof(struct ieee80211_wmm_rule), - GFP_KERNEL); - - if (!tmp_rd) { - kfree(regdom); - kfree(wmm_ptrs); - return -ENOMEM; - } - regdom = tmp_rd; - - wmm_rule = (struct ieee80211_wmm_rule *) - ((u8 *)regdom + size_of_regd + n_wmms * - sizeof(struct ieee80211_wmm_rule)); - - set_wmm_rule(wmm_rule, wmm); - wmm_ptrs[n_wmms].ptr = wmm_ptr; - wmm_ptrs[n_wmms++].rule = wmm_rule; + set_wmm_rule(rrule, wmm); } } - kfree(wmm_ptrs); return reg_schedule_apply(regdom); } From 20932750d9c78d307e4f2f18f9c6a32b82b1e0e8 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Mon, 20 Aug 2018 13:56:07 +0300 Subject: [PATCH 20/91] mac80211: don't update the PM state of a peer upon a multicast frame I changed the way mac80211 updates the PM state of the peer. I forgot that we could also have multicast frames from the peer and that those frame should of course not change the PM state of the peer: A peer goes to power save when it needs to scan, but it won't send the broadcast Probe Request with the PM bit set. This made us mark the peer as awake when it wasn't and then Intel's firmware would fail to transmit because the peer is asleep according to its database. The driver warned about this and it looked like this: WARNING: CPU: 0 PID: 184 at /usr/src/linux-4.16.14/drivers/net/wireless/intel/iwlwifi/mvm/tx.c:1369 iwl_mvm_rx_tx_cmd+0x53b/0x860 CPU: 0 PID: 184 Comm: irq/124-iwlwifi Not tainted 4.16.14 #1 RIP: 0010:iwl_mvm_rx_tx_cmd+0x53b/0x860 Call Trace: iwl_pcie_rx_handle+0x220/0x880 iwl_pcie_irq_handler+0x6c9/0xa20 ? irq_forced_thread_fn+0x60/0x60 ? irq_thread_dtor+0x90/0x90 The relevant code that spits the WARNING is: case TX_STATUS_FAIL_DEST_PS: /* the FW should have stopped the queue and not * return this status */ WARN_ON(1); info->flags |= IEEE80211_TX_STAT_TX_FILTERED; This fixes https://bugzilla.kernel.org/show_bug.cgi?id=199967. Fixes: 9fef65443388 ("mac80211: always update the PM state of a peer on MGMT / DATA frames") Cc: #4.16+ Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 932985ca4e66..3f80a5ca4050 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1612,6 +1612,7 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx) */ if (!ieee80211_hw_check(&sta->local->hw, AP_LINK_PS) && !ieee80211_has_morefrags(hdr->frame_control) && + !is_multicast_ether_addr(hdr->addr1) && (ieee80211_is_mgmt(hdr->frame_control) || ieee80211_is_data(hdr->frame_control)) && !(status->rx_flags & IEEE80211_RX_DEFERRED_RELEASE) && From 3a2af7cccbbaf2362db9053a946a6084e12bfa73 Mon Sep 17 00:00:00 2001 From: Jinbum Park Date: Tue, 31 Jul 2018 23:10:40 +0900 Subject: [PATCH 21/91] mac80211_hwsim: Fix possible Spectre-v1 for hwsim_world_regdom_custom User controls @idx which to be used as index of hwsim_world_regdom_custom. So, It can be exploited via Spectre-like attack. (speculative execution) This kind of attack leaks address of hwsim_world_regdom_custom, It leads an attacker to bypass security mechanism such as KASLR. So sanitize @idx before using it to prevent attack. I leveraged strategy [1] to find and exploit this gadget. [1] https://github.com/jinb-park/linux-exploit/tree/master/exploit-remaining-spectre-gadget/ Signed-off-by: Jinbum Park [johannes: unwrap URL] Signed-off-by: Johannes Berg --- drivers/net/wireless/mac80211_hwsim.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index 7d0b460868f9..80e2c8595c7c 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -33,6 +33,7 @@ #include #include #include +#include #include "mac80211_hwsim.h" #define WARN_QUEUE 100 @@ -3229,6 +3230,9 @@ static int hwsim_new_radio_nl(struct sk_buff *msg, struct genl_info *info) kfree(hwname); return -EINVAL; } + + idx = array_index_nospec(idx, + ARRAY_SIZE(hwsim_world_regdom_custom)); param.regd = hwsim_world_regdom_custom[idx]; } From d3c89bbc7491d5e288ca2993e999d24ba9ff52ad Mon Sep 17 00:00:00 2001 From: Haim Dreyfuss Date: Tue, 21 Aug 2018 09:22:19 +0300 Subject: [PATCH 22/91] nl80211: Fix nla_put_u8 to u16 for NL80211_WMMR_TXOP TXOP (also known as Channel Occupancy Time) is u16 and should be added using nla_put_u16 instead of u8, fix that. Fixes: 50f32718e125 ("nl80211: Add wmm rule attribute to NL80211_CMD_GET_WIPHY dump command") Signed-off-by: Haim Dreyfuss Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index e3dcffd96919..3f7ffbe6c634 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -672,8 +672,8 @@ static int nl80211_msg_put_wmm_rules(struct sk_buff *msg, rule->wmm_rule.client[j].cw_max) || nla_put_u8(msg, NL80211_WMMR_AIFSN, rule->wmm_rule.client[j].aifsn) || - nla_put_u8(msg, NL80211_WMMR_TXOP, - rule->wmm_rule.client[j].cot)) + nla_put_u16(msg, NL80211_WMMR_TXOP, + rule->wmm_rule.client[j].cot)) goto nla_put_failure; nla_nest_end(msg, nl_wmm_rule); From b88d26d97c41680f7327e5fb8061ad0037877f40 Mon Sep 17 00:00:00 2001 From: Haim Dreyfuss Date: Tue, 21 Aug 2018 09:22:20 +0300 Subject: [PATCH 23/91] nl80211: Pass center frequency in kHz instead of MHz freq_reg_info expects to get the frequency in kHz. Instead we accidently pass it in MHz. Thus, currently the function always return ERR rule. Fix that. Fixes: 50f32718e125 ("nl80211: Add wmm rule attribute to NL80211_CMD_GET_WIPHY dump command") Signed-off-by: Haim Dreyfuss Signed-off-by: Luca Coelho [fix kHz/MHz in commit message] Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 3f7ffbe6c634..ce0149a86c13 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -764,7 +764,7 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy, if (large) { const struct ieee80211_reg_rule *rule = - freq_reg_info(wiphy, chan->center_freq); + freq_reg_info(wiphy, MHZ_TO_KHZ(chan->center_freq)); if (!IS_ERR_OR_NULL(rule) && rule->has_wmm) { if (nl80211_msg_put_wmm_rules(msg, rule)) From 5b24109b0563d45094c470684c1f8cea1af269f8 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 28 Aug 2018 16:15:35 +0200 Subject: [PATCH 24/91] bpf: fix several offset tests in bpf_msg_pull_data While recently going over bpf_msg_pull_data(), I noticed three issues which are fixed in here: 1) When we attempt to find the first scatterlist element (sge) for the start offset, we add len to the offset before we check for start < offset + len, whereas it should come after when we iterate to the next sge to accumulate the offsets. For example, given a start offset of 12 with a sge length of 8 for the first sge in the list would lead us to determine this sge as the first sge thinking it covers first 16 bytes where start is located, whereas start sits in subsequent sges so we would end up pulling in the wrong data. 2) After figuring out the starting sge, we have a short-cut test in !msg->sg_copy[i] && bytes <= len. This checks whether it's not needed to make the page at the sge private where we can just exit by updating msg->data and msg->data_end. However, the length test is not fully correct. bytes <= len checks whether the requested bytes (end - start offsets) fit into the sge's length. The part that is missing is that start must not be sge length aligned. Meaning, the start offset into the sge needs to be accounted as well on top of the requested bytes as otherwise we can access the sge out of bounds. For example the sge could have length of 8, our requested bytes could have length of 8, but at a start offset of 4, so we also would need to pull in 4 bytes of the next sge, when we jump to the out label we do set msg->data to sg_virt(&sg[i]) + start - offset and msg->data_end to msg->data + bytes which would be oob. 3) The subsequent bytes < copy test for finding the last sge has the same issue as in point 2) but also it tests for less than rather than less or equal to. Meaning if the sge length is of 8 and requested bytes of 8 while having the start aligned with the sge, we would unnecessarily go and pull in the next sge as well to make it private. Fixes: 015632bb30da ("bpf: sk_msg program helper bpf_sk_msg_pull_data") Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 7a2430945c71..ec4d67c0cf0c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2286,10 +2286,10 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg_buff *, msg, u32, start, u32, end, u64, flags) { unsigned int len = 0, offset = 0, copy = 0; + int bytes = end - start, bytes_sg_total; struct scatterlist *sg = msg->sg_data; int first_sg, last_sg, i, shift; unsigned char *p, *to, *from; - int bytes = end - start; struct page *page; if (unlikely(flags || end <= start)) @@ -2299,9 +2299,9 @@ BPF_CALL_4(bpf_msg_pull_data, i = msg->sg_start; do { len = sg[i].length; - offset += len; if (start < offset + len) break; + offset += len; i++; if (i == MAX_SKB_FRAGS) i = 0; @@ -2310,7 +2310,11 @@ BPF_CALL_4(bpf_msg_pull_data, if (unlikely(start >= offset + len)) return -EINVAL; - if (!msg->sg_copy[i] && bytes <= len) + /* The start may point into the sg element so we need to also + * account for the headroom. + */ + bytes_sg_total = start - offset + bytes; + if (!msg->sg_copy[i] && bytes_sg_total <= len) goto out; first_sg = i; @@ -2330,12 +2334,12 @@ BPF_CALL_4(bpf_msg_pull_data, i++; if (i == MAX_SKB_FRAGS) i = 0; - if (bytes < copy) + if (bytes_sg_total <= copy) break; } while (i != msg->sg_end); last_sg = i; - if (unlikely(copy < end - start)) + if (unlikely(bytes_sg_total > copy)) return -EINVAL; page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC, get_order(copy)); From 1f631c3201fe5491808df143d8fcba81b3197ffd Mon Sep 17 00:00:00 2001 From: Yuan-Chi Pang Date: Wed, 29 Aug 2018 09:30:08 +0800 Subject: [PATCH 25/91] mac80211: mesh: fix HWMP sequence numbering to follow standard IEEE 802.11-2016 14.10.8.3 HWMP sequence numbering says: If it is a target mesh STA, it shall update its own HWMP SN to maximum (current HWMP SN, target HWMP SN in the PREQ element) + 1 immediately before it generates a PREP element in response to a PREQ element. Signed-off-by: Yuan-Chi Pang Signed-off-by: Johannes Berg --- net/mac80211/mesh_hwmp.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index 35ad3983ae4b..daf9db3c8f24 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -572,6 +572,10 @@ static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata, forward = false; reply = true; target_metric = 0; + + if (SN_GT(target_sn, ifmsh->sn)) + ifmsh->sn = target_sn; + if (time_after(jiffies, ifmsh->last_sn_update + net_traversal_jiffies(sdata)) || time_before(jiffies, ifmsh->last_sn_update)) { From 166ac9d55b0ab70b644e429be1f217fe8393cbd7 Mon Sep 17 00:00:00 2001 From: Sara Sharon Date: Wed, 29 Aug 2018 08:57:02 +0200 Subject: [PATCH 26/91] mac80211: avoid kernel panic when building AMSDU from non-linear SKB When building building AMSDU from non-linear SKB, we hit a kernel panic when trying to push the padding to the tail. Instead, put the padding at the head of the next subframe. This also fixes the A-MSDU subframes to not have the padding accounted in the length field and not have pad at all for the last subframe, both required by the spec. Fixes: 6e0456b54545 ("mac80211: add A-MSDU tx support") Signed-off-by: Sara Sharon Reviewed-by: Lorenzo Bianconi Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index fa1f1e63a264..667a73d6eb5c 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -3073,27 +3073,18 @@ void ieee80211_clear_fast_xmit(struct sta_info *sta) } static bool ieee80211_amsdu_realloc_pad(struct ieee80211_local *local, - struct sk_buff *skb, int headroom, - int *subframe_len) + struct sk_buff *skb, int headroom) { - int amsdu_len = *subframe_len + sizeof(struct ethhdr); - int padding = (4 - amsdu_len) & 3; - - if (skb_headroom(skb) < headroom || skb_tailroom(skb) < padding) { + if (skb_headroom(skb) < headroom) { I802_DEBUG_INC(local->tx_expand_skb_head); - if (pskb_expand_head(skb, headroom, padding, GFP_ATOMIC)) { + if (pskb_expand_head(skb, headroom, 0, GFP_ATOMIC)) { wiphy_debug(local->hw.wiphy, "failed to reallocate TX buffer\n"); return false; } } - if (padding) { - *subframe_len += padding; - skb_put_zero(skb, padding); - } - return true; } @@ -3117,8 +3108,7 @@ static bool ieee80211_amsdu_prepare_head(struct ieee80211_sub_if_data *sdata, if (info->control.flags & IEEE80211_TX_CTRL_AMSDU) return true; - if (!ieee80211_amsdu_realloc_pad(local, skb, sizeof(*amsdu_hdr), - &subframe_len)) + if (!ieee80211_amsdu_realloc_pad(local, skb, sizeof(*amsdu_hdr))) return false; data = skb_push(skb, sizeof(*amsdu_hdr)); @@ -3184,7 +3174,8 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata, void *data; bool ret = false; unsigned int orig_len; - int n = 1, nfrags; + int n = 1, nfrags, pad = 0; + u16 hdrlen; if (!ieee80211_hw_check(&local->hw, TX_AMSDU)) return false; @@ -3235,8 +3226,19 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata, if (max_frags && nfrags > max_frags) goto out; - if (!ieee80211_amsdu_realloc_pad(local, skb, sizeof(rfc1042_header) + 2, - &subframe_len)) + /* + * Pad out the previous subframe to a multiple of 4 by adding the + * padding to the next one, that's being added. Note that head->len + * is the length of the full A-MSDU, but that works since each time + * we add a new subframe we pad out the previous one to a multiple + * of 4 and thus it no longer matters in the next round. + */ + hdrlen = fast_tx->hdr_len - sizeof(rfc1042_header); + if ((head->len - hdrlen) & 3) + pad = 4 - ((head->len - hdrlen) & 3); + + if (!ieee80211_amsdu_realloc_pad(local, skb, sizeof(rfc1042_header) + + 2 + pad)) goto out; ret = true; @@ -3248,6 +3250,8 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata, memcpy(data, &len, 2); memcpy(data + 2, rfc1042_header, sizeof(rfc1042_header)); + memset(skb_push(skb, pad), 0, pad); + head->len += skb->len; head->data_len += skb->len; *frag_tail = skb; From 0e06b227c5221dd51b5569de93f3b9f532be4a32 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 29 Aug 2018 16:50:34 +0200 Subject: [PATCH 27/91] bpf: fix msg->data/data_end after sg shift repair in bpf_msg_pull_data In the current code, msg->data is set as sg_virt(&sg[i]) + start - offset and msg->data_end relative to it as msg->data + bytes. Using iterator i to point to the updated starting scatterlist element holds true for some cases, however not for all where we'd end up pointing out of bounds. It is /correct/ for these ones: 1) When first finding the starting scatterlist element (sge) where we find that the page is already privately owned by the msg and where the requested bytes and headroom fit into the sge's length. However, it's /incorrect/ for the following ones: 2) After we made the requested area private and updated the newly allocated page into first_sg slot of the scatterlist ring; when we find that no shift repair of the ring is needed where we bail out updating msg->data and msg->data_end. At that point i will point to last_sg, which in this case is the next elem of first_sg in the ring. The sge at that point might as well be invalid (e.g. i == msg->sg_end), which we use for setting the range of sg_virt(&sg[i]). The correct one would have been first_sg. 3) Similar as in 2) but when we find that a shift repair of the ring is needed. In this case we fix up all sges and stop once we've reached the end. In this case i will point to will point to the new msg->sg_end, and the sge at that point will be invalid. Again here the requested range sits in first_sg. Fixes: 015632bb30da ("bpf: sk_msg program helper bpf_sk_msg_pull_data") Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index ec4d67c0cf0c..b9225c55926a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2310,6 +2310,7 @@ BPF_CALL_4(bpf_msg_pull_data, if (unlikely(start >= offset + len)) return -EINVAL; + first_sg = i; /* The start may point into the sg element so we need to also * account for the headroom. */ @@ -2317,8 +2318,6 @@ BPF_CALL_4(bpf_msg_pull_data, if (!msg->sg_copy[i] && bytes_sg_total <= len) goto out; - first_sg = i; - /* At this point we need to linearize multiple scatterlist * elements or a single shared page. Either way we need to * copy into a linear buffer exclusively owned by BPF. Then @@ -2400,7 +2399,7 @@ BPF_CALL_4(bpf_msg_pull_data, if (msg->sg_end < 0) msg->sg_end += MAX_SKB_FRAGS; out: - msg->data = sg_virt(&sg[i]) + start - offset; + msg->data = sg_virt(&sg[first_sg]) + start - offset; msg->data_end = msg->data + bytes; return 0; From 2e43f95dd8ee62bc8bf57f2afac37fbd70c8d565 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 29 Aug 2018 16:50:35 +0200 Subject: [PATCH 28/91] bpf: fix shift upon scatterlist ring wrap-around in bpf_msg_pull_data If first_sg and last_sg wraps around in the scatterlist ring, then we need to account for that in the shift as well. E.g. crafting such msgs where this is the case leads to a hang as shift becomes negative. E.g. consider the following scenario: first_sg := 14 |=> shift := -12 msg->sg_start := 10 last_sg := 3 | msg->sg_end := 5 round 1: i := 15, move_from := 3, sg[15] := sg[ 3] round 2: i := 0, move_from := -12, sg[ 0] := sg[-12] round 3: i := 1, move_from := -11, sg[ 1] := sg[-11] round 4: i := 2, move_from := -10, sg[ 2] := sg[-10] [...] round 13: i := 11, move_from := -1, sg[ 2] := sg[ -1] round 14: i := 12, move_from := 0, sg[ 2] := sg[ 0] round 15: i := 13, move_from := 1, sg[ 2] := sg[ 1] round 16: i := 14, move_from := 2, sg[ 2] := sg[ 2] round 17: i := 15, move_from := 3, sg[ 2] := sg[ 3] [...] This means we will loop forever and never hit the msg->sg_end condition to break out of the loop. When we see that the ring wraps around, then the shift should be MAX_SKB_FRAGS - first_sg + last_sg - 1. Meaning, the remainder slots from the tail of the ring and the head until last_sg combined. Fixes: 015632bb30da ("bpf: sk_msg program helper bpf_sk_msg_pull_data") Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index b9225c55926a..43ba5f8c38ca 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2370,7 +2370,10 @@ BPF_CALL_4(bpf_msg_pull_data, * had a single entry though we can just replace it and * be done. Otherwise walk the ring and shift the entries. */ - shift = last_sg - first_sg - 1; + WARN_ON_ONCE(last_sg == first_sg); + shift = last_sg > first_sg ? + last_sg - first_sg - 1 : + MAX_SKB_FRAGS - first_sg + last_sg - 1; if (!shift) goto out; From a8cf76a9023bc6709b1361d06bb2fae5227b9d68 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 29 Aug 2018 16:50:36 +0200 Subject: [PATCH 29/91] bpf: fix sg shift repair start offset in bpf_msg_pull_data When we perform the sg shift repair for the scatterlist ring, we currently start out at i = first_sg + 1. However, this is not correct since the first_sg could point to the sge sitting at slot MAX_SKB_FRAGS - 1, and a subsequent i = MAX_SKB_FRAGS will access the scatterlist ring (sg) out of bounds. Add the sk_msg_iter_var() helper for iterating through the ring, and apply the same rule for advancing to the next ring element as we do elsewhere. Later work will use this helper also in other places. Fixes: 015632bb30da ("bpf: sk_msg program helper bpf_sk_msg_pull_data") Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 43ba5f8c38ca..2c7801f6737a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2282,6 +2282,13 @@ static const struct bpf_func_proto bpf_msg_cork_bytes_proto = { .arg2_type = ARG_ANYTHING, }; +#define sk_msg_iter_var(var) \ + do { \ + var++; \ + if (var == MAX_SKB_FRAGS) \ + var = 0; \ + } while (0) + BPF_CALL_4(bpf_msg_pull_data, struct sk_msg_buff *, msg, u32, start, u32, end, u64, flags) { @@ -2302,9 +2309,7 @@ BPF_CALL_4(bpf_msg_pull_data, if (start < offset + len) break; offset += len; - i++; - if (i == MAX_SKB_FRAGS) - i = 0; + sk_msg_iter_var(i); } while (i != msg->sg_end); if (unlikely(start >= offset + len)) @@ -2330,9 +2335,7 @@ BPF_CALL_4(bpf_msg_pull_data, */ do { copy += sg[i].length; - i++; - if (i == MAX_SKB_FRAGS) - i = 0; + sk_msg_iter_var(i); if (bytes_sg_total <= copy) break; } while (i != msg->sg_end); @@ -2358,9 +2361,7 @@ BPF_CALL_4(bpf_msg_pull_data, sg[i].length = 0; put_page(sg_page(&sg[i])); - i++; - if (i == MAX_SKB_FRAGS) - i = 0; + sk_msg_iter_var(i); } while (i != last_sg); sg[first_sg].length = copy; @@ -2377,7 +2378,8 @@ BPF_CALL_4(bpf_msg_pull_data, if (!shift) goto out; - i = first_sg + 1; + i = first_sg; + sk_msg_iter_var(i); do { int move_from; @@ -2394,9 +2396,7 @@ BPF_CALL_4(bpf_msg_pull_data, sg[move_from].page_link = 0; sg[move_from].offset = 0; - i++; - if (i == MAX_SKB_FRAGS) - i = 0; + sk_msg_iter_var(i); } while (1); msg->sg_end -= shift; if (msg->sg_end < 0) From 9f2895461439fda2801a7906fb4c5fb3dbb37a0a Mon Sep 17 00:00:00 2001 From: Alexey Kodanev Date: Thu, 23 Aug 2018 19:49:54 +0300 Subject: [PATCH 30/91] vti6: remove !skb->ignore_df check from vti6_xmit() Before the commit d6990976af7c ("vti6: fix PMTU caching and reporting on xmit") '!skb->ignore_df' check was always true because the function skb_scrub_packet() was called before it, resetting ignore_df to zero. In the commit, skb_scrub_packet() was moved below, and now this check can be false for the packet, e.g. when sending it in the two fragments, this prevents successful PMTU updates in such case. The next attempts to send the packet lead to the same tx error. Moreover, vti6 initial MTU value relies on PMTU adjustments. This issue can be reproduced with the following LTP test script: udp_ipsec_vti.sh -6 -p ah -m tunnel -s 2000 Fixes: ccd740cbc6e0 ("vti6: Add pmtu handling to vti6_xmit.") Signed-off-by: Alexey Kodanev Acked-by: Steffen Klassert Signed-off-by: David S. Miller --- net/ipv6/ip6_vti.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 5095367c7204..eeaf7455d51e 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -481,7 +481,7 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) } mtu = dst_mtu(dst); - if (!skb->ignore_df && skb->len > mtu) { + if (skb->len > mtu) { skb_dst_update_pmtu(skb, mtu); if (skb->protocol == htons(ETH_P_IPV6)) { From bd583fe30427500a2d0abe25724025b1cb5e2636 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 23 Aug 2018 16:19:44 -0700 Subject: [PATCH 31/91] tipc: fix a missing rhashtable_walk_exit() rhashtable_walk_exit() must be paired with rhashtable_walk_enter(). Fixes: 40f9f4397060 ("tipc: Fix tipc_sk_reinit race conditions") Cc: Herbert Xu Cc: Ying Xue Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/tipc/socket.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index c1e93c9515bc..c9a50b62c738 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -2672,6 +2672,8 @@ void tipc_sk_reinit(struct net *net) rhashtable_walk_stop(&iter); } while (tsk == ERR_PTR(-EAGAIN)); + + rhashtable_walk_exit(&iter); } static struct tipc_sock *tipc_sk_lookup(struct net *net, u32 portid) From e5133f2f1261f8ab412e7fc5e3694c9f84328f89 Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Fri, 24 Aug 2018 11:04:40 +0200 Subject: [PATCH 32/91] Revert "net: stmmac: Do not keep rearming the coalesce timer in stmmac_xmit" This reverts commit 4ae0169fd1b3c792b66be58995b7e6b629919ecf. This change in the handling of the coalesce timer is causing regression on (at least) amlogic platforms. Network will break down very quickly (a few seconds) after starting a download. This can easily be reproduced using iperf3 for example. The problem has been reported on the S805, S905, S912 and A113 SoCs (Realtek and Micrel PHYs) and it is likely impacting all Amlogics platforms using Gbit ethernet No problem was seen with the platform using 10/100 only PHYs (GXL internal) Reverting change brings things back to normal and allows to use network again until we better understand the problem with the coalesce timer. Cc: Jose Abreu Cc: Joao Pinto Cc: Vitor Soares Cc: Giuseppe Cavallaro Cc: Alexandre Torgue Cc: Corentin Labbe Signed-off-by: Jerome Brunet Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac.h | 1 - drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 5 +---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h index 76649adf8fb0..c0a855b7ab3b 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h @@ -112,7 +112,6 @@ struct stmmac_priv { u32 tx_count_frames; u32 tx_coal_frames; u32 tx_coal_timer; - bool tx_timer_armed; int tx_coalesce; int hwts_tx_en; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index ff1ffb46198a..9f458bb16f2a 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -3147,16 +3147,13 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev) * element in case of no SG. */ priv->tx_count_frames += nfrags + 1; - if (likely(priv->tx_coal_frames > priv->tx_count_frames) && - !priv->tx_timer_armed) { + if (likely(priv->tx_coal_frames > priv->tx_count_frames)) { mod_timer(&priv->txtimer, STMMAC_COAL_TIMER(priv->tx_coal_timer)); - priv->tx_timer_armed = true; } else { priv->tx_count_frames = 0; stmmac_set_tx_ic(priv, desc); priv->xstats.tx_set_ic_bit++; - priv->tx_timer_armed = false; } skb_tx_timestamp(skb); From 9a07efa9aea2f4a59f35da0785a4e6a6b5a96192 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 24 Aug 2018 12:28:06 -0700 Subject: [PATCH 33/91] tipc: switch to rhashtable iterator syzbot reported a use-after-free in tipc_group_fill_sock_diag(), where tipc_group_fill_sock_diag() still reads tsk->group meanwhile tipc_group_delete() just deletes it in tipc_release(). tipc_nl_sk_walk() aims to lock this sock when walking each sock in the hash table to close race conditions with sock changes like this one, by acquiring tsk->sk.sk_lock.slock spinlock, unfortunately this doesn't work at all. All non-BH call path should take lock_sock() instead to make it work. tipc_nl_sk_walk() brutally iterates with raw rht_for_each_entry_rcu() where RCU read lock is required, this is the reason why lock_sock() can't be taken on this path. This could be resolved by switching to rhashtable iterator API's, where taking a sleepable lock is possible. Also, the iterator API's are friendly for restartable calls like diag dump, the last position is remembered behind the scence, all we need to do here is saving the iterator into cb->args[]. I tested this with parallel tipc diag dump and thousands of tipc socket creation and release, no crash or memory leak. Reported-by: syzbot+b9c8f3ab2994b7cd1625@syzkaller.appspotmail.com Cc: Jon Maloy Cc: Ying Xue Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/tipc/diag.c | 2 ++ net/tipc/netlink.c | 2 ++ net/tipc/socket.c | 80 ++++++++++++++++++++++++++++++---------------- net/tipc/socket.h | 2 ++ 4 files changed, 58 insertions(+), 28 deletions(-) diff --git a/net/tipc/diag.c b/net/tipc/diag.c index aaabb0b776dd..73137f4aeb68 100644 --- a/net/tipc/diag.c +++ b/net/tipc/diag.c @@ -84,7 +84,9 @@ static int tipc_sock_diag_handler_dump(struct sk_buff *skb, if (h->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { + .start = tipc_dump_start, .dump = tipc_diag_dump, + .done = tipc_dump_done, }; netlink_dump_start(net->diag_nlsk, skb, h, &c); return 0; diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index 6ff2254088f6..99ee419210ba 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -167,7 +167,9 @@ static const struct genl_ops tipc_genl_v2_ops[] = { }, { .cmd = TIPC_NL_SOCK_GET, + .start = tipc_dump_start, .dumpit = tipc_nl_sk_dump, + .done = tipc_dump_done, .policy = tipc_nl_policy, }, { diff --git a/net/tipc/socket.c b/net/tipc/socket.c index c9a50b62c738..ab7a2a7178f7 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -3229,45 +3229,69 @@ int tipc_nl_sk_walk(struct sk_buff *skb, struct netlink_callback *cb, struct netlink_callback *cb, struct tipc_sock *tsk)) { - struct net *net = sock_net(skb->sk); - struct tipc_net *tn = tipc_net(net); - const struct bucket_table *tbl; - u32 prev_portid = cb->args[1]; - u32 tbl_id = cb->args[0]; - struct rhash_head *pos; + struct rhashtable_iter *iter = (void *)cb->args[0]; struct tipc_sock *tsk; int err; - rcu_read_lock(); - tbl = rht_dereference_rcu((&tn->sk_rht)->tbl, &tn->sk_rht); - for (; tbl_id < tbl->size; tbl_id++) { - rht_for_each_entry_rcu(tsk, pos, tbl, tbl_id, node) { - spin_lock_bh(&tsk->sk.sk_lock.slock); - if (prev_portid && prev_portid != tsk->portid) { - spin_unlock_bh(&tsk->sk.sk_lock.slock); + rhashtable_walk_start(iter); + while ((tsk = rhashtable_walk_next(iter)) != NULL) { + if (IS_ERR(tsk)) { + err = PTR_ERR(tsk); + if (err == -EAGAIN) { + err = 0; continue; } - - err = skb_handler(skb, cb, tsk); - if (err) { - prev_portid = tsk->portid; - spin_unlock_bh(&tsk->sk.sk_lock.slock); - goto out; - } - - prev_portid = 0; - spin_unlock_bh(&tsk->sk.sk_lock.slock); + break; } - } -out: - rcu_read_unlock(); - cb->args[0] = tbl_id; - cb->args[1] = prev_portid; + sock_hold(&tsk->sk); + rhashtable_walk_stop(iter); + lock_sock(&tsk->sk); + err = skb_handler(skb, cb, tsk); + if (err) { + release_sock(&tsk->sk); + sock_put(&tsk->sk); + goto out; + } + release_sock(&tsk->sk); + rhashtable_walk_start(iter); + sock_put(&tsk->sk); + } + rhashtable_walk_stop(iter); +out: return skb->len; } EXPORT_SYMBOL(tipc_nl_sk_walk); +int tipc_dump_start(struct netlink_callback *cb) +{ + struct rhashtable_iter *iter = (void *)cb->args[0]; + struct net *net = sock_net(cb->skb->sk); + struct tipc_net *tn = tipc_net(net); + + if (!iter) { + iter = kmalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return -ENOMEM; + + cb->args[0] = (long)iter; + } + + rhashtable_walk_enter(&tn->sk_rht, iter); + return 0; +} +EXPORT_SYMBOL(tipc_dump_start); + +int tipc_dump_done(struct netlink_callback *cb) +{ + struct rhashtable_iter *hti = (void *)cb->args[0]; + + rhashtable_walk_exit(hti); + kfree(hti); + return 0; +} +EXPORT_SYMBOL(tipc_dump_done); + int tipc_sk_fill_sock_diag(struct sk_buff *skb, struct netlink_callback *cb, struct tipc_sock *tsk, u32 sk_filter_state, u64 (*tipc_diag_gen_cookie)(struct sock *sk)) diff --git a/net/tipc/socket.h b/net/tipc/socket.h index aff9b2ae5a1f..d43032e26532 100644 --- a/net/tipc/socket.h +++ b/net/tipc/socket.h @@ -68,4 +68,6 @@ int tipc_nl_sk_walk(struct sk_buff *skb, struct netlink_callback *cb, int (*skb_handler)(struct sk_buff *skb, struct netlink_callback *cb, struct tipc_sock *tsk)); +int tipc_dump_start(struct netlink_callback *cb); +int tipc_dump_done(struct netlink_callback *cb); #endif From 05212ba8132b42047ab5d63d759c6f9c28e7eab5 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Sun, 26 Aug 2018 17:03:09 +0300 Subject: [PATCH 34/91] r8169: set RxConfig after tx/rx is enabled for RTL8169sb/8110sb devices I have two Ethernet adapters: r8169 0000:03:01.0 eth0: RTL8169sb/8110sb, 00:14:d1:14:2d:49, XID 10000000, IRQ 18 r8169 0000:01:00.0 eth0: RTL8168e/8111e, 64:66:b3:11:14:5d, XID 2c200000, IRQ 30 And after upgrading from linux 4.15 [1] to linux 4.18+ [2] RTL8169sb failed to receive any packets. tcpdump shows a lot of checksum mismatch. [1]: a0f79386a4968b4925da6db2d1daffd0605a4402 [2]: 0519359784328bfa92bf0931bf0cff3b58c16932 (4.19 merge window opened) I started bisecting and the found that [3] breaks it. According to [4]: "For 8110S, 8110SB, and 8110SC series, the initial value of RxConfig needs to be set after the tx/rx is enabled." So I moved rtl_init_rxcfg() after enabling tx/rs and now my adapter works (RTL8168e works too). [3]: 3559d81e76bfe3803e89f2e04cf6ef7ab4f3aace [4]: e542a2269f232d61270ceddd42b73a4348dee2bb ("r8169: adjust the RxConfig settings.") Also drop "rx" from rtl_set_rx_tx_config_registers(), since it does nothing with it already. Fixes: 3559d81e76bfe3803e89f2e04cf6ef7ab4f3aace ("r8169: simplify rtl_hw_start_8169") Cc: Heiner Kallweit Cc: David S. Miller Cc: netdev@vger.kernel.org Cc: Realtek linux nic maintainers Signed-off-by: Azat Khuzhin Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c index 0efa977c422d..ac306797590e 100644 --- a/drivers/net/ethernet/realtek/r8169.c +++ b/drivers/net/ethernet/realtek/r8169.c @@ -4522,7 +4522,7 @@ static void rtl8169_hw_reset(struct rtl8169_private *tp) rtl_hw_reset(tp); } -static void rtl_set_rx_tx_config_registers(struct rtl8169_private *tp) +static void rtl_set_tx_config_registers(struct rtl8169_private *tp) { /* Set DMA burst size and Interframe Gap Time */ RTL_W32(tp, TxConfig, (TX_DMA_BURST << TxDMAShift) | @@ -4633,12 +4633,14 @@ static void rtl_hw_start(struct rtl8169_private *tp) rtl_set_rx_max_size(tp); rtl_set_rx_tx_desc_registers(tp); - rtl_set_rx_tx_config_registers(tp); + rtl_set_tx_config_registers(tp); RTL_W8(tp, Cfg9346, Cfg9346_Lock); /* Initially a 10 us delay. Turned it into a PCI commit. - FR */ RTL_R8(tp, IntrMask); RTL_W8(tp, ChipCmd, CmdTxEnb | CmdRxEnb); + rtl_init_rxcfg(tp); + rtl_set_rx_mode(tp->dev); /* no early-rx interrupts */ RTL_W16(tp, MultiIntr, RTL_R16(tp, MultiIntr) & 0xf000); From 31fabbee8f5c658c3fa1603c66e9e4f51ea8c2c6 Mon Sep 17 00:00:00 2001 From: Peng Li Date: Mon, 27 Aug 2018 09:59:29 +0800 Subject: [PATCH 35/91] net: hns: add the code for cleaning pkt in chip If there are packets in hardware when changing the speed or duplex, it may cause hardware hang up. This patch adds the code for waiting chip to clean the all pkts(TX & RX) in chip when the driver uses the function named "adjust link". This patch cleans the pkts as follows: 1) close rx of chip, close tx of protocol stack. 2) wait rcb, ppe, mac to clean. 3) adjust link 4) open rx of chip, open tx of protocol stack. Signed-off-by: Peng Li Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns/hnae.h | 2 + .../net/ethernet/hisilicon/hns/hns_ae_adapt.c | 67 ++++++++++++++++++- .../ethernet/hisilicon/hns/hns_dsaf_gmac.c | 36 ++++++++++ .../net/ethernet/hisilicon/hns/hns_dsaf_mac.c | 44 ++++++++++++ .../net/ethernet/hisilicon/hns/hns_dsaf_mac.h | 8 +++ .../ethernet/hisilicon/hns/hns_dsaf_main.c | 29 ++++++++ .../ethernet/hisilicon/hns/hns_dsaf_main.h | 3 + .../net/ethernet/hisilicon/hns/hns_dsaf_ppe.c | 23 +++++++ .../net/ethernet/hisilicon/hns/hns_dsaf_ppe.h | 1 + .../net/ethernet/hisilicon/hns/hns_dsaf_rcb.c | 23 +++++++ .../net/ethernet/hisilicon/hns/hns_dsaf_rcb.h | 1 + .../net/ethernet/hisilicon/hns/hns_dsaf_reg.h | 1 + drivers/net/ethernet/hisilicon/hns/hns_enet.c | 21 +++++- 13 files changed, 255 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns/hnae.h b/drivers/net/ethernet/hisilicon/hns/hnae.h index cad52bd331f7..08a750fb60c4 100644 --- a/drivers/net/ethernet/hisilicon/hns/hnae.h +++ b/drivers/net/ethernet/hisilicon/hns/hnae.h @@ -486,6 +486,8 @@ struct hnae_ae_ops { u8 *auto_neg, u16 *speed, u8 *duplex); void (*toggle_ring_irq)(struct hnae_ring *ring, u32 val); void (*adjust_link)(struct hnae_handle *handle, int speed, int duplex); + bool (*need_adjust_link)(struct hnae_handle *handle, + int speed, int duplex); int (*set_loopback)(struct hnae_handle *handle, enum hnae_loop loop_mode, int en); void (*get_ring_bdnum_limit)(struct hnae_queue *queue, diff --git a/drivers/net/ethernet/hisilicon/hns/hns_ae_adapt.c b/drivers/net/ethernet/hisilicon/hns/hns_ae_adapt.c index e6aad30e7e69..b52029e26d15 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_ae_adapt.c +++ b/drivers/net/ethernet/hisilicon/hns/hns_ae_adapt.c @@ -155,6 +155,41 @@ static void hns_ae_put_handle(struct hnae_handle *handle) hns_ae_get_ring_pair(handle->qs[i])->used_by_vf = 0; } +static int hns_ae_wait_flow_down(struct hnae_handle *handle) +{ + struct dsaf_device *dsaf_dev; + struct hns_ppe_cb *ppe_cb; + struct hnae_vf_cb *vf_cb; + int ret; + int i; + + for (i = 0; i < handle->q_num; i++) { + ret = hns_rcb_wait_tx_ring_clean(handle->qs[i]); + if (ret) + return ret; + } + + ppe_cb = hns_get_ppe_cb(handle); + ret = hns_ppe_wait_tx_fifo_clean(ppe_cb); + if (ret) + return ret; + + dsaf_dev = hns_ae_get_dsaf_dev(handle->dev); + if (!dsaf_dev) + return -EINVAL; + ret = hns_dsaf_wait_pkt_clean(dsaf_dev, handle->dport_id); + if (ret) + return ret; + + vf_cb = hns_ae_get_vf_cb(handle); + ret = hns_mac_wait_fifo_clean(vf_cb->mac_cb); + if (ret) + return ret; + + mdelay(10); + return 0; +} + static void hns_ae_ring_enable_all(struct hnae_handle *handle, int val) { int q_num = handle->q_num; @@ -399,12 +434,41 @@ static int hns_ae_get_mac_info(struct hnae_handle *handle, return hns_mac_get_port_info(mac_cb, auto_neg, speed, duplex); } +static bool hns_ae_need_adjust_link(struct hnae_handle *handle, int speed, + int duplex) +{ + struct hns_mac_cb *mac_cb = hns_get_mac_cb(handle); + + return hns_mac_need_adjust_link(mac_cb, speed, duplex); +} + static void hns_ae_adjust_link(struct hnae_handle *handle, int speed, int duplex) { struct hns_mac_cb *mac_cb = hns_get_mac_cb(handle); - hns_mac_adjust_link(mac_cb, speed, duplex); + switch (mac_cb->dsaf_dev->dsaf_ver) { + case AE_VERSION_1: + hns_mac_adjust_link(mac_cb, speed, duplex); + break; + + case AE_VERSION_2: + /* chip need to clear all pkt inside */ + hns_mac_disable(mac_cb, MAC_COMM_MODE_RX); + if (hns_ae_wait_flow_down(handle)) { + hns_mac_enable(mac_cb, MAC_COMM_MODE_RX); + break; + } + + hns_mac_adjust_link(mac_cb, speed, duplex); + hns_mac_enable(mac_cb, MAC_COMM_MODE_RX); + break; + + default: + break; + } + + return; } static void hns_ae_get_ring_bdnum_limit(struct hnae_queue *queue, @@ -902,6 +966,7 @@ static struct hnae_ae_ops hns_dsaf_ops = { .get_status = hns_ae_get_link_status, .get_info = hns_ae_get_mac_info, .adjust_link = hns_ae_adjust_link, + .need_adjust_link = hns_ae_need_adjust_link, .set_loopback = hns_ae_config_loopback, .get_ring_bdnum_limit = hns_ae_get_ring_bdnum_limit, .get_pauseparam = hns_ae_get_pauseparam, diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_gmac.c b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_gmac.c index 5488c6e89f21..09e4061d1fa6 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_gmac.c +++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_gmac.c @@ -257,6 +257,16 @@ static void hns_gmac_get_pausefrm_cfg(void *mac_drv, u32 *rx_pause_en, *tx_pause_en = dsaf_get_bit(pause_en, GMAC_PAUSE_EN_TX_FDFC_B); } +static bool hns_gmac_need_adjust_link(void *mac_drv, enum mac_speed speed, + int duplex) +{ + struct mac_driver *drv = (struct mac_driver *)mac_drv; + struct hns_mac_cb *mac_cb = drv->mac_cb; + + return (mac_cb->speed != speed) || + (mac_cb->half_duplex == duplex); +} + static int hns_gmac_adjust_link(void *mac_drv, enum mac_speed speed, u32 full_duplex) { @@ -309,6 +319,30 @@ static void hns_gmac_set_promisc(void *mac_drv, u8 en) hns_gmac_set_uc_match(mac_drv, en); } +int hns_gmac_wait_fifo_clean(void *mac_drv) +{ + struct mac_driver *drv = (struct mac_driver *)mac_drv; + int wait_cnt; + u32 val; + + wait_cnt = 0; + while (wait_cnt++ < HNS_MAX_WAIT_CNT) { + val = dsaf_read_dev(drv, GMAC_FIFO_STATE_REG); + /* bit5~bit0 is not send complete pkts */ + if ((val & 0x3f) == 0) + break; + usleep_range(100, 200); + } + + if (wait_cnt >= HNS_MAX_WAIT_CNT) { + dev_err(drv->dev, + "hns ge %d fifo was not idle.\n", drv->mac_id); + return -EBUSY; + } + + return 0; +} + static void hns_gmac_init(void *mac_drv) { u32 port; @@ -690,6 +724,7 @@ void *hns_gmac_config(struct hns_mac_cb *mac_cb, struct mac_params *mac_param) mac_drv->mac_disable = hns_gmac_disable; mac_drv->mac_free = hns_gmac_free; mac_drv->adjust_link = hns_gmac_adjust_link; + mac_drv->need_adjust_link = hns_gmac_need_adjust_link; mac_drv->set_tx_auto_pause_frames = hns_gmac_set_tx_auto_pause_frames; mac_drv->config_max_frame_length = hns_gmac_config_max_frame_length; mac_drv->mac_pausefrm_cfg = hns_gmac_pause_frm_cfg; @@ -717,6 +752,7 @@ void *hns_gmac_config(struct hns_mac_cb *mac_cb, struct mac_params *mac_param) mac_drv->get_strings = hns_gmac_get_strings; mac_drv->update_stats = hns_gmac_update_stats; mac_drv->set_promiscuous = hns_gmac_set_promisc; + mac_drv->wait_fifo_clean = hns_gmac_wait_fifo_clean; return (void *)mac_drv; } diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.c b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.c index 1c2326bd76e2..6ed6f142427e 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.c +++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.c @@ -114,6 +114,26 @@ int hns_mac_get_port_info(struct hns_mac_cb *mac_cb, return 0; } +/** + *hns_mac_is_adjust_link - check is need change mac speed and duplex register + *@mac_cb: mac device + *@speed: phy device speed + *@duplex:phy device duplex + * + */ +bool hns_mac_need_adjust_link(struct hns_mac_cb *mac_cb, int speed, int duplex) +{ + struct mac_driver *mac_ctrl_drv; + + mac_ctrl_drv = (struct mac_driver *)(mac_cb->priv.mac); + + if (mac_ctrl_drv->need_adjust_link) + return mac_ctrl_drv->need_adjust_link(mac_ctrl_drv, + (enum mac_speed)speed, duplex); + else + return true; +} + void hns_mac_adjust_link(struct hns_mac_cb *mac_cb, int speed, int duplex) { int ret; @@ -430,6 +450,16 @@ int hns_mac_vm_config_bc_en(struct hns_mac_cb *mac_cb, u32 vmid, bool enable) return 0; } +int hns_mac_wait_fifo_clean(struct hns_mac_cb *mac_cb) +{ + struct mac_driver *drv = hns_mac_get_drv(mac_cb); + + if (drv->wait_fifo_clean) + return drv->wait_fifo_clean(drv); + + return 0; +} + void hns_mac_reset(struct hns_mac_cb *mac_cb) { struct mac_driver *drv = hns_mac_get_drv(mac_cb); @@ -998,6 +1028,20 @@ static int hns_mac_get_max_port_num(struct dsaf_device *dsaf_dev) return DSAF_MAX_PORT_NUM; } +void hns_mac_enable(struct hns_mac_cb *mac_cb, enum mac_commom_mode mode) +{ + struct mac_driver *mac_ctrl_drv = hns_mac_get_drv(mac_cb); + + mac_ctrl_drv->mac_enable(mac_cb->priv.mac, mode); +} + +void hns_mac_disable(struct hns_mac_cb *mac_cb, enum mac_commom_mode mode) +{ + struct mac_driver *mac_ctrl_drv = hns_mac_get_drv(mac_cb); + + mac_ctrl_drv->mac_disable(mac_cb->priv.mac, mode); +} + /** * hns_mac_init - init mac * @dsaf_dev: dsa fabric device struct pointer diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.h b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.h index bbc0a98e7ca3..fbc75341bef7 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.h +++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.h @@ -356,6 +356,9 @@ struct mac_driver { /*adjust mac mode of port,include speed and duplex*/ int (*adjust_link)(void *mac_drv, enum mac_speed speed, u32 full_duplex); + /* need adjust link */ + bool (*need_adjust_link)(void *mac_drv, enum mac_speed speed, + int duplex); /* config autoegotaite mode of port*/ void (*set_an_mode)(void *mac_drv, u8 enable); /* config loopbank mode */ @@ -394,6 +397,7 @@ struct mac_driver { void (*get_info)(void *mac_drv, struct mac_info *mac_info); void (*update_stats)(void *mac_drv); + int (*wait_fifo_clean)(void *mac_drv); enum mac_mode mac_mode; u8 mac_id; @@ -427,6 +431,7 @@ void *hns_xgmac_config(struct hns_mac_cb *mac_cb, int hns_mac_init(struct dsaf_device *dsaf_dev); void mac_adjust_link(struct net_device *net_dev); +bool hns_mac_need_adjust_link(struct hns_mac_cb *mac_cb, int speed, int duplex); void hns_mac_get_link_status(struct hns_mac_cb *mac_cb, u32 *link_status); int hns_mac_change_vf_addr(struct hns_mac_cb *mac_cb, u32 vmid, char *addr); int hns_mac_set_multi(struct hns_mac_cb *mac_cb, @@ -463,5 +468,8 @@ int hns_mac_add_uc_addr(struct hns_mac_cb *mac_cb, u8 vf_id, int hns_mac_rm_uc_addr(struct hns_mac_cb *mac_cb, u8 vf_id, const unsigned char *addr); int hns_mac_clr_multicast(struct hns_mac_cb *mac_cb, int vfn); +void hns_mac_enable(struct hns_mac_cb *mac_cb, enum mac_commom_mode mode); +void hns_mac_disable(struct hns_mac_cb *mac_cb, enum mac_commom_mode mode); +int hns_mac_wait_fifo_clean(struct hns_mac_cb *mac_cb); #endif /* _HNS_DSAF_MAC_H */ diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c index ca50c2553a9c..e557a4ef5996 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c +++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c @@ -2727,6 +2727,35 @@ void hns_dsaf_set_promisc_tcam(struct dsaf_device *dsaf_dev, soft_mac_entry->index = enable ? entry_index : DSAF_INVALID_ENTRY_IDX; } +int hns_dsaf_wait_pkt_clean(struct dsaf_device *dsaf_dev, int port) +{ + u32 val, val_tmp; + int wait_cnt; + + if (port >= DSAF_SERVICE_NW_NUM) + return 0; + + wait_cnt = 0; + while (wait_cnt++ < HNS_MAX_WAIT_CNT) { + val = dsaf_read_dev(dsaf_dev, DSAF_VOQ_IN_PKT_NUM_0_REG + + (port + DSAF_XGE_NUM) * 0x40); + val_tmp = dsaf_read_dev(dsaf_dev, DSAF_VOQ_OUT_PKT_NUM_0_REG + + (port + DSAF_XGE_NUM) * 0x40); + if (val == val_tmp) + break; + + usleep_range(100, 200); + } + + if (wait_cnt >= HNS_MAX_WAIT_CNT) { + dev_err(dsaf_dev->dev, "hns dsaf clean wait timeout(%u - %u).\n", + val, val_tmp); + return -EBUSY; + } + + return 0; +} + /** * dsaf_probe - probo dsaf dev * @pdev: dasf platform device diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.h b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.h index 4507e8222683..0e1cd99831a6 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.h +++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.h @@ -44,6 +44,8 @@ struct hns_mac_cb; #define DSAF_ROCE_CREDIT_CHN 8 #define DSAF_ROCE_CHAN_MODE 3 +#define HNS_MAX_WAIT_CNT 10000 + enum dsaf_roce_port_mode { DSAF_ROCE_6PORT_MODE, DSAF_ROCE_4PORT_MODE, @@ -463,5 +465,6 @@ int hns_dsaf_rm_mac_addr( int hns_dsaf_clr_mac_mc_port(struct dsaf_device *dsaf_dev, u8 mac_id, u8 port_num); +int hns_dsaf_wait_pkt_clean(struct dsaf_device *dsaf_dev, int port); #endif /* __HNS_DSAF_MAIN_H__ */ diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_ppe.c b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_ppe.c index d160d8c9e45b..0942e4916d9d 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_ppe.c +++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_ppe.c @@ -275,6 +275,29 @@ static void hns_ppe_exc_irq_en(struct hns_ppe_cb *ppe_cb, int en) dsaf_write_dev(ppe_cb, PPE_INTEN_REG, msk_vlue & vld_msk); } +int hns_ppe_wait_tx_fifo_clean(struct hns_ppe_cb *ppe_cb) +{ + int wait_cnt; + u32 val; + + wait_cnt = 0; + while (wait_cnt++ < HNS_MAX_WAIT_CNT) { + val = dsaf_read_dev(ppe_cb, PPE_CURR_TX_FIFO0_REG) & 0x3ffU; + if (!val) + break; + + usleep_range(100, 200); + } + + if (wait_cnt >= HNS_MAX_WAIT_CNT) { + dev_err(ppe_cb->dev, "hns ppe tx fifo clean wait timeout, still has %u pkt.\n", + val); + return -EBUSY; + } + + return 0; +} + /** * ppe_init_hw - init ppe * @ppe_cb: ppe device diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_ppe.h b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_ppe.h index 9d8e643e8aa6..f670e63a5a01 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_ppe.h +++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_ppe.h @@ -100,6 +100,7 @@ struct ppe_common_cb { }; +int hns_ppe_wait_tx_fifo_clean(struct hns_ppe_cb *ppe_cb); int hns_ppe_init(struct dsaf_device *dsaf_dev); void hns_ppe_uninit(struct dsaf_device *dsaf_dev); diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_rcb.c b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_rcb.c index 9d76e2e54f9d..5d64519b9b1d 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_rcb.c +++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_rcb.c @@ -66,6 +66,29 @@ void hns_rcb_wait_fbd_clean(struct hnae_queue **qs, int q_num, u32 flag) "queue(%d) wait fbd(%d) clean fail!!\n", i, fbd_num); } +int hns_rcb_wait_tx_ring_clean(struct hnae_queue *qs) +{ + u32 head, tail; + int wait_cnt; + + tail = dsaf_read_dev(&qs->tx_ring, RCB_REG_TAIL); + wait_cnt = 0; + while (wait_cnt++ < HNS_MAX_WAIT_CNT) { + head = dsaf_read_dev(&qs->tx_ring, RCB_REG_HEAD); + if (tail == head) + break; + + usleep_range(100, 200); + } + + if (wait_cnt >= HNS_MAX_WAIT_CNT) { + dev_err(qs->dev->dev, "rcb wait timeout, head not equal to tail.\n"); + return -EBUSY; + } + + return 0; +} + /** *hns_rcb_reset_ring_hw - ring reset *@q: ring struct pointer diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_rcb.h b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_rcb.h index 602816498c8d..2319b772a271 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_rcb.h +++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_rcb.h @@ -136,6 +136,7 @@ void hns_rcbv2_int_clr_hw(struct hnae_queue *q, u32 flag); void hns_rcb_init_hw(struct ring_pair_cb *ring); void hns_rcb_reset_ring_hw(struct hnae_queue *q); void hns_rcb_wait_fbd_clean(struct hnae_queue **qs, int q_num, u32 flag); +int hns_rcb_wait_tx_ring_clean(struct hnae_queue *qs); u32 hns_rcb_get_rx_coalesced_frames( struct rcb_common_cb *rcb_common, u32 port_idx); u32 hns_rcb_get_tx_coalesced_frames( diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_reg.h b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_reg.h index 886cbbf25761..74d935d82cbc 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_reg.h +++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_reg.h @@ -464,6 +464,7 @@ #define RCB_RING_INTMSK_TX_OVERTIME_REG 0x000C4 #define RCB_RING_INTSTS_TX_OVERTIME_REG 0x000C8 +#define GMAC_FIFO_STATE_REG 0x0000UL #define GMAC_DUPLEX_TYPE_REG 0x0008UL #define GMAC_FD_FC_TYPE_REG 0x000CUL #define GMAC_TX_WATER_LINE_REG 0x0010UL diff --git a/drivers/net/ethernet/hisilicon/hns/hns_enet.c b/drivers/net/ethernet/hisilicon/hns/hns_enet.c index 02a0ba20fad5..f56855e63c96 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_enet.c +++ b/drivers/net/ethernet/hisilicon/hns/hns_enet.c @@ -1112,11 +1112,26 @@ static void hns_nic_adjust_link(struct net_device *ndev) struct hnae_handle *h = priv->ae_handle; int state = 1; + /* If there is no phy, do not need adjust link */ if (ndev->phydev) { - h->dev->ops->adjust_link(h, ndev->phydev->speed, - ndev->phydev->duplex); - state = ndev->phydev->link; + /* When phy link down, do nothing */ + if (ndev->phydev->link == 0) + return; + + if (h->dev->ops->need_adjust_link(h, ndev->phydev->speed, + ndev->phydev->duplex)) { + /* because Hi161X chip don't support to change gmac + * speed and duplex with traffic. Delay 200ms to + * make sure there is no more data in chip FIFO. + */ + netif_carrier_off(ndev); + msleep(200); + h->dev->ops->adjust_link(h, ndev->phydev->speed, + ndev->phydev->duplex); + netif_carrier_on(ndev); + } } + state = state && h->dev->ops->get_status(h); if (state != priv->link) { From 455c4401fe7a538facaffb35b906ce19f1ece474 Mon Sep 17 00:00:00 2001 From: Peng Li Date: Mon, 27 Aug 2018 09:59:30 +0800 Subject: [PATCH 36/91] net: hns: add netif_carrier_off before change speed and duplex If there are packets in hardware when changing the speed or duplex, it may cause hardware hang up. This patch adds netif_carrier_off before change speed and duplex in ethtool_ops.set_link_ksettings, and adds netif_carrier_on after complete the change. Signed-off-by: Peng Li Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns/hns_ethtool.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c b/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c index 08f3c4743f74..774beda040a1 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c @@ -243,7 +243,9 @@ static int hns_nic_set_link_ksettings(struct net_device *net_dev, } if (h->dev->ops->adjust_link) { + netif_carrier_off(net_dev); h->dev->ops->adjust_link(h, (int)speed, cmd->base.duplex); + netif_carrier_on(net_dev); return 0; } From 6e0bb04d0e4f597d8d8f4f21401a9636f2809fd1 Mon Sep 17 00:00:00 2001 From: Chris Brandt Date: Mon, 27 Aug 2018 12:42:02 -0500 Subject: [PATCH 37/91] sh_eth: Add R7S9210 support Add support for the R7S9210 which is part of the RZ/A2 series. Signed-off-by: Chris Brandt Acked-by: Rob Herring Signed-off-by: David S. Miller --- .../devicetree/bindings/net/sh_eth.txt | 1 + drivers/net/ethernet/renesas/sh_eth.c | 36 +++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/Documentation/devicetree/bindings/net/sh_eth.txt b/Documentation/devicetree/bindings/net/sh_eth.txt index 76db9f13ad96..abc36274227c 100644 --- a/Documentation/devicetree/bindings/net/sh_eth.txt +++ b/Documentation/devicetree/bindings/net/sh_eth.txt @@ -16,6 +16,7 @@ Required properties: "renesas,ether-r8a7794" if the device is a part of R8A7794 SoC. "renesas,gether-r8a77980" if the device is a part of R8A77980 SoC. "renesas,ether-r7s72100" if the device is a part of R7S72100 SoC. + "renesas,ether-r7s9210" if the device is a part of R7S9210 SoC. "renesas,rcar-gen1-ether" for a generic R-Car Gen1 device. "renesas,rcar-gen2-ether" for a generic R-Car Gen2 or RZ/G1 device. diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c index ad4433d59237..f27a0dc8c563 100644 --- a/drivers/net/ethernet/renesas/sh_eth.c +++ b/drivers/net/ethernet/renesas/sh_eth.c @@ -798,6 +798,41 @@ static struct sh_eth_cpu_data r8a77980_data = { .magic = 1, .cexcr = 1, }; + +/* R7S9210 */ +static struct sh_eth_cpu_data r7s9210_data = { + .soft_reset = sh_eth_soft_reset, + + .set_duplex = sh_eth_set_duplex, + .set_rate = sh_eth_set_rate_rcar, + + .register_type = SH_ETH_REG_FAST_SH4, + + .edtrr_trns = EDTRR_TRNS_ETHER, + .ecsr_value = ECSR_ICD, + .ecsipr_value = ECSIPR_ICDIP, + .eesipr_value = EESIPR_TWBIP | EESIPR_TABTIP | EESIPR_RABTIP | + EESIPR_RFCOFIP | EESIPR_ECIIP | EESIPR_FTCIP | + EESIPR_TDEIP | EESIPR_TFUFIP | EESIPR_FRIP | + EESIPR_RDEIP | EESIPR_RFOFIP | EESIPR_CNDIP | + EESIPR_DLCIP | EESIPR_CDIP | EESIPR_TROIP | + EESIPR_RMAFIP | EESIPR_RRFIP | EESIPR_RTLFIP | + EESIPR_RTSFIP | EESIPR_PREIP | EESIPR_CERFIP, + + .tx_check = EESR_FTC | EESR_CND | EESR_DLC | EESR_CD | EESR_TRO, + .eesr_err_check = EESR_TWB | EESR_TABT | EESR_RABT | EESR_RFE | + EESR_RDE | EESR_RFRMER | EESR_TFE | EESR_TDE, + + .fdr_value = 0x0000070f, + + .apr = 1, + .mpr = 1, + .tpauser = 1, + .hw_swap = 1, + .rpadir = 1, + .no_ade = 1, + .xdfar_rw = 1, +}; #endif /* CONFIG_OF */ static void sh_eth_set_rate_sh7724(struct net_device *ndev) @@ -3121,6 +3156,7 @@ static const struct of_device_id sh_eth_match_table[] = { { .compatible = "renesas,ether-r8a7794", .data = &rcar_gen2_data }, { .compatible = "renesas,gether-r8a77980", .data = &r8a77980_data }, { .compatible = "renesas,ether-r7s72100", .data = &r7s72100_data }, + { .compatible = "renesas,ether-r7s9210", .data = &r7s9210_data }, { .compatible = "renesas,rcar-gen1-ether", .data = &rcar_gen1_data }, { .compatible = "renesas,rcar-gen2-ether", .data = &rcar_gen2_data }, { } From 85eb9af182243ce9a8b72410d5321c440ac5f8d7 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Mon, 27 Aug 2018 22:56:22 +0200 Subject: [PATCH 38/91] net/sched: act_pedit: fix dump of extended layered op in the (rare) case of failure in nla_nest_start(), missing NULL checks in tcf_pedit_key_ex_dump() can make the following command # tc action add action pedit ex munge ip ttl set 64 dereference a NULL pointer: BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 PGD 800000007d1cd067 P4D 800000007d1cd067 PUD 7acd3067 PMD 0 Oops: 0002 [#1] SMP PTI CPU: 0 PID: 3336 Comm: tc Tainted: G E 4.18.0.pedit+ #425 Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 RIP: 0010:tcf_pedit_dump+0x19d/0x358 [act_pedit] Code: be 02 00 00 00 48 89 df 66 89 44 24 20 e8 9b b1 fd e0 85 c0 75 46 8b 83 c8 00 00 00 49 83 c5 08 48 03 83 d0 00 00 00 4d 39 f5 <66> 89 04 25 00 00 00 00 0f 84 81 01 00 00 41 8b 45 00 48 8d 4c 24 RSP: 0018:ffffb5d4004478a8 EFLAGS: 00010246 RAX: ffff8880fcda2070 RBX: ffff8880fadd2900 RCX: 0000000000000000 RDX: 0000000000000002 RSI: ffffb5d4004478ca RDI: ffff8880fcda206e RBP: ffff8880fb9cb900 R08: 0000000000000008 R09: ffff8880fcda206e R10: ffff8880fadd2900 R11: 0000000000000000 R12: ffff8880fd26cf40 R13: ffff8880fc957430 R14: ffff8880fc957430 R15: ffff8880fb9cb988 FS: 00007f75a537a740(0000) GS:ffff8880fda00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000007a2fa005 CR4: 00000000001606f0 Call Trace: ? __nla_reserve+0x38/0x50 tcf_action_dump_1+0xd2/0x130 tcf_action_dump+0x6a/0xf0 tca_get_fill.constprop.31+0xa3/0x120 tcf_action_add+0xd1/0x170 tc_ctl_action+0x137/0x150 rtnetlink_rcv_msg+0x263/0x2d0 ? _cond_resched+0x15/0x40 ? rtnl_calcit.isra.30+0x110/0x110 netlink_rcv_skb+0x4d/0x130 netlink_unicast+0x1a3/0x250 netlink_sendmsg+0x2ae/0x3a0 sock_sendmsg+0x36/0x40 ___sys_sendmsg+0x26f/0x2d0 ? do_wp_page+0x8e/0x5f0 ? handle_pte_fault+0x6c3/0xf50 ? __handle_mm_fault+0x38e/0x520 ? __sys_sendmsg+0x5e/0xa0 __sys_sendmsg+0x5e/0xa0 do_syscall_64+0x5b/0x180 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f75a4583ba0 Code: c3 48 8b 05 f2 62 2c 00 f7 db 64 89 18 48 83 cb ff eb dd 0f 1f 80 00 00 00 00 83 3d fd c3 2c 00 00 75 10 b8 2e 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 ae cc 00 00 48 89 04 24 RSP: 002b:00007fff60ee7418 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00007fff60ee7540 RCX: 00007f75a4583ba0 RDX: 0000000000000000 RSI: 00007fff60ee7490 RDI: 0000000000000003 RBP: 000000005b842d3e R08: 0000000000000002 R09: 0000000000000000 R10: 00007fff60ee6ea0 R11: 0000000000000246 R12: 0000000000000000 R13: 00007fff60ee7554 R14: 0000000000000001 R15: 000000000066c100 Modules linked in: act_pedit(E) ip6table_filter ip6_tables iptable_filter binfmt_misc crct10dif_pclmul ext4 crc32_pclmul mbcache ghash_clmulni_intel jbd2 pcbc snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hda_core snd_hwdep snd_seq snd_seq_device snd_pcm aesni_intel crypto_simd snd_timer cryptd glue_helper snd joydev pcspkr soundcore virtio_balloon i2c_piix4 nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs libcrc32c ata_generic pata_acpi virtio_net net_failover virtio_blk virtio_console failover qxl crc32c_intel drm_kms_helper syscopyarea serio_raw sysfillrect sysimgblt fb_sys_fops ttm drm ata_piix virtio_pci libata virtio_ring i2c_core virtio floppy dm_mirror dm_region_hash dm_log dm_mod [last unloaded: act_pedit] CR2: 0000000000000000 Like it's done for other TC actions, give up dumping pedit rules and return an error if nla_nest_start() returns NULL. Fixes: 71d0ed7079df ("net/act_pedit: Support using offset relative to the conventional network headers") Signed-off-by: Davide Caratti Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/act_pedit.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 107034070019..ad99a99f11f6 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -109,16 +109,18 @@ static int tcf_pedit_key_ex_dump(struct sk_buff *skb, { struct nlattr *keys_start = nla_nest_start(skb, TCA_PEDIT_KEYS_EX); + if (!keys_start) + goto nla_failure; for (; n > 0; n--) { struct nlattr *key_start; key_start = nla_nest_start(skb, TCA_PEDIT_KEY_EX); + if (!key_start) + goto nla_failure; if (nla_put_u16(skb, TCA_PEDIT_KEY_EX_HTYPE, keys_ex->htype) || - nla_put_u16(skb, TCA_PEDIT_KEY_EX_CMD, keys_ex->cmd)) { - nlmsg_trim(skb, keys_start); - return -EINVAL; - } + nla_put_u16(skb, TCA_PEDIT_KEY_EX_CMD, keys_ex->cmd)) + goto nla_failure; nla_nest_end(skb, key_start); @@ -128,6 +130,9 @@ static int tcf_pedit_key_ex_dump(struct sk_buff *skb, nla_nest_end(skb, keys_start); return 0; +nla_failure: + nla_nest_cancel(skb, keys_start); + return -EINVAL; } static int tcf_pedit_init(struct net *net, struct nlattr *nla, @@ -418,7 +423,10 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, opt->bindcnt = atomic_read(&p->tcf_bindcnt) - bind; if (p->tcfp_keys_ex) { - tcf_pedit_key_ex_dump(skb, p->tcfp_keys_ex, p->tcfp_nkeys); + if (tcf_pedit_key_ex_dump(skb, + p->tcfp_keys_ex, + p->tcfp_nkeys)) + goto nla_put_failure; if (nla_put(skb, TCA_PEDIT_PARMS_EX, s, opt)) goto nla_put_failure; From afe49de44c27a89e8e9631c44b5ffadf6ace65e2 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Tue, 28 Aug 2018 13:40:51 +0200 Subject: [PATCH 39/91] ipv6: fix cleanup ordering for ip6_mr failure Commit 15e668070a64 ("ipv6: reorder icmpv6_init() and ip6_mr_init()") moved the cleanup label for ipmr_fail, but should have changed the contents of the cleanup labels as well. Now we can end up cleaning up icmpv6 even though it hasn't been initialized (jump to icmp_fail or ipmr_fail). Simply undo things in the reverse order of their initialization. Example of panic (triggered by faking a failure of icmpv6_init): kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] PREEMPT SMP KASAN PTI [...] RIP: 0010:__list_del_entry_valid+0x79/0x160 [...] Call Trace: ? lock_release+0x8a0/0x8a0 unregister_pernet_operations+0xd4/0x560 ? ops_free_list+0x480/0x480 ? down_write+0x91/0x130 ? unregister_pernet_subsys+0x15/0x30 ? down_read+0x1b0/0x1b0 ? up_read+0x110/0x110 ? kmem_cache_create_usercopy+0x1b4/0x240 unregister_pernet_subsys+0x1d/0x30 icmpv6_cleanup+0x1d/0x30 inet6_init+0x1b5/0x23f Fixes: 15e668070a64 ("ipv6: reorder icmpv6_init() and ip6_mr_init()") Signed-off-by: Sabrina Dubroca Signed-off-by: David S. Miller --- net/ipv6/af_inet6.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 673bba31eb18..e5da133c6932 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -1113,11 +1113,11 @@ static int __init inet6_init(void) igmp_fail: ndisc_cleanup(); ndisc_fail: - ip6_mr_cleanup(); -icmp_fail: - unregister_pernet_subsys(&inet6_net_ops); -ipmr_fail: icmpv6_cleanup(); +icmp_fail: + ip6_mr_cleanup(); +ipmr_fail: + unregister_pernet_subsys(&inet6_net_ops); register_pernet_fail: sock_unregister(PF_INET6); rtnl_unregister_all(PF_INET6); From a03dc36bdca6b614651fedfcd8559cf914d2d21d Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Tue, 28 Aug 2018 13:40:52 +0200 Subject: [PATCH 40/91] ipv6: fix cleanup ordering for pingv6 registration Commit 6d0bfe226116 ("net: ipv6: Add IPv6 support to the ping socket.") contains an error in the cleanup path of inet6_init(): when proto_register(&pingv6_prot, 1) fails, we try to unregister &pingv6_prot. When rawv6_init() fails, we skip unregistering &pingv6_prot. Example of panic (triggered by faking a failure of proto_register(&pingv6_prot, 1)): general protection fault: 0000 [#1] PREEMPT SMP KASAN PTI [...] RIP: 0010:__list_del_entry_valid+0x79/0x160 [...] Call Trace: proto_unregister+0xbb/0x550 ? trace_preempt_on+0x6f0/0x6f0 ? sock_no_shutdown+0x10/0x10 inet6_init+0x153/0x1b8 Fixes: 6d0bfe226116 ("net: ipv6: Add IPv6 support to the ping socket.") Signed-off-by: Sabrina Dubroca Signed-off-by: David S. Miller --- net/ipv6/af_inet6.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index e5da133c6932..9a4261e50272 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -938,14 +938,14 @@ static int __init inet6_init(void) err = proto_register(&pingv6_prot, 1); if (err) - goto out_unregister_ping_proto; + goto out_unregister_raw_proto; /* We MUST register RAW sockets before we create the ICMP6, * IGMP6, or NDISC control sockets. */ err = rawv6_init(); if (err) - goto out_unregister_raw_proto; + goto out_unregister_ping_proto; /* Register the family here so that the init calls below will * be able to create sockets. (?? is this dangerous ??) From f707ef61e17261f2bb18c3e4871c6f135ab3aba9 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Tue, 28 Aug 2018 13:40:53 +0200 Subject: [PATCH 41/91] net: rtnl: return early from rtnl_unregister_all when protocol isn't registered rtnl_unregister_all(PF_INET6) gets called from inet6_init in cases when no handler has been registered for PF_INET6 yet, for example if ip6_mr_init() fails. Abort and avoid a NULL pointer deref in that case. Example of panic (triggered by faking a failure of register_pernet_subsys): general protection fault: 0000 [#1] PREEMPT SMP KASAN PTI [...] RIP: 0010:rtnl_unregister_all+0x17e/0x2a0 [...] Call Trace: ? rtnetlink_net_init+0x250/0x250 ? sock_unregister+0x103/0x160 ? kernel_getsockopt+0x200/0x200 inet6_init+0x197/0x20d Fixes: e2fddf5e96df ("[IPV6]: Make af_inet6 to check ip6_route_init return value.") Signed-off-by: Sabrina Dubroca Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 24431e578310..60c928894a78 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -324,6 +324,10 @@ void rtnl_unregister_all(int protocol) rtnl_lock(); tab = rtnl_msg_handlers[protocol]; + if (!tab) { + rtnl_unlock(); + return; + } RCU_INIT_POINTER(rtnl_msg_handlers[protocol], NULL); for (msgindex = 0; msgindex < RTM_NR_MSGTYPES; msgindex++) { link = tab[msgindex]; From c305660b325463cdf3d944492f96155dc931b448 Mon Sep 17 00:00:00 2001 From: Dinh Nguyen Date: Tue, 28 Aug 2018 10:19:20 -0500 Subject: [PATCH 42/91] net: stmmac: build the dwmac-socfpga platform driver for Stratix10 The Stratix10 SoC is an AARCH64 based platform that shares the same ethernet controller that is on other SoCFPGA platforms. Build the platform driver. Signed-off-by: Dinh Nguyen Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/Kconfig b/drivers/net/ethernet/stmicro/stmmac/Kconfig index bf4acebb6bcd..324049eebb9b 100644 --- a/drivers/net/ethernet/stmicro/stmmac/Kconfig +++ b/drivers/net/ethernet/stmicro/stmmac/Kconfig @@ -110,7 +110,7 @@ config DWMAC_ROCKCHIP config DWMAC_SOCFPGA tristate "SOCFPGA dwmac support" - default ARCH_SOCFPGA + default (ARCH_SOCFPGA || ARCH_STRATIX10) depends on OF && (ARCH_SOCFPGA || ARCH_STRATIX10 || COMPILE_TEST) select MFD_SYSCON help From c3c397c1f16c51601a3fac4fe0c63ad8aa85a904 Mon Sep 17 00:00:00 2001 From: Doug Berger Date: Tue, 28 Aug 2018 12:33:15 -0700 Subject: [PATCH 43/91] net: bcmgenet: use MAC link status for fixed phy When using the fixed PHY with GENET (e.g. MOCA) the PHY link status can be determined from the internal link status captured by the MAC. This allows the PHY state machine to use the correct link state with the fixed PHY even if MAC link event interrupts are missed when the net device is opened. Fixes: 8d88c6ebb34c ("net: bcmgenet: enable MoCA link state change detection") Signed-off-by: Doug Berger Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/genet/bcmgenet.h | 3 +++ drivers/net/ethernet/broadcom/genet/bcmmii.c | 10 ++++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.h b/drivers/net/ethernet/broadcom/genet/bcmgenet.h index b773bc07edf7..14b49612aa86 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.h +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.h @@ -186,6 +186,9 @@ struct bcmgenet_mib_counters { #define UMAC_MAC1 0x010 #define UMAC_MAX_FRAME_LEN 0x014 +#define UMAC_MODE 0x44 +#define MODE_LINK_STATUS (1 << 5) + #define UMAC_EEE_CTRL 0x064 #define EN_LPI_RX_PAUSE (1 << 0) #define EN_LPI_TX_PFC (1 << 1) diff --git a/drivers/net/ethernet/broadcom/genet/bcmmii.c b/drivers/net/ethernet/broadcom/genet/bcmmii.c index 5333274a283c..4241ae928d4a 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmmii.c +++ b/drivers/net/ethernet/broadcom/genet/bcmmii.c @@ -115,8 +115,14 @@ void bcmgenet_mii_setup(struct net_device *dev) static int bcmgenet_fixed_phy_link_update(struct net_device *dev, struct fixed_phy_status *status) { - if (dev && dev->phydev && status) - status->link = dev->phydev->link; + struct bcmgenet_priv *priv; + u32 reg; + + if (dev && dev->phydev && status) { + priv = netdev_priv(dev); + reg = bcmgenet_umac_readl(priv, UMAC_MODE); + status->link = !!(reg & MODE_LINK_STATUS); + } return 0; } From c4053ef322081554765e1b708d6cdd8855e1d72d Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Wed, 29 Aug 2018 09:44:39 +0300 Subject: [PATCH 44/91] net: mvpp2: initialize port of_node pointer Without a valid of_node in struct device we can't find the mvpp2 port device by its DT node. Specifically, this breaks of_find_net_device_by_node(). For example, the Armada 8040 based Clearfog GT-8K uses Marvell 88E6141 switch connected to the &cp1_eth2 port: &cp1_mdio { ... switch0: switch0@4 { compatible = "marvell,mv88e6085"; ... ports { ... port@5 { reg = <5>; label = "cpu"; ethernet = <&cp1_eth2>; }; }; }; }; Without this patch, dsa_register_switch() returns -EPROBE_DEFER because of_find_net_device_by_node() can't find the device_node of the &cp1_eth2 device. Reviewed-by: Andrew Lunn Signed-off-by: Baruch Siach Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index 32d785b616e1..28500417843e 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -4803,6 +4803,7 @@ static int mvpp2_port_probe(struct platform_device *pdev, dev->min_mtu = ETH_MIN_MTU; /* 9704 == 9728 - 20 and rounding to 8 */ dev->max_mtu = MVPP2_BM_JUMBO_PKT_SIZE; + dev->dev.of_node = port_node; /* Phylink isn't used w/ ACPI as of now */ if (port_node) { From 97763dc0f4010bc20e2969a6bf9a40a2551c4f79 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 29 Aug 2018 10:22:33 +0200 Subject: [PATCH 45/91] net_sched: reject unknown tcfa_action values After the commit 802bfb19152c ("net/sched: user-space can't set unknown tcfa_action values"), unknown tcfa_action values are converted to TC_ACT_UNSPEC, but the common agreement is instead rejecting such configurations. This change also introduces a helper to simplify the destruction of a single action, avoiding code duplication. v1 -> v2: - helper is now static and renamed according to act_* convention - updated extack message, according to the new behavior Fixes: 802bfb19152c ("net/sched: user-space can't set unknown tcfa_action values") Signed-off-by: Paolo Abeni Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/act_api.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/net/sched/act_api.c b/net/sched/act_api.c index db83dac1e7f4..316c98bb87e4 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -662,6 +662,13 @@ int tcf_action_destroy(struct tc_action *actions[], int bind) return ret; } +static int tcf_action_destroy_1(struct tc_action *a, int bind) +{ + struct tc_action *actions[] = { a, NULL }; + + return tcf_action_destroy(actions, bind); +} + static int tcf_action_put(struct tc_action *p) { return __tcf_action_put(p, false); @@ -881,17 +888,16 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, if (TC_ACT_EXT_CMP(a->tcfa_action, TC_ACT_GOTO_CHAIN)) { err = tcf_action_goto_chain_init(a, tp); if (err) { - struct tc_action *actions[] = { a, NULL }; - - tcf_action_destroy(actions, bind); + tcf_action_destroy_1(a, bind); NL_SET_ERR_MSG(extack, "Failed to init TC action chain"); return ERR_PTR(err); } } if (!tcf_action_valid(a->tcfa_action)) { - NL_SET_ERR_MSG(extack, "invalid action value, using TC_ACT_UNSPEC instead"); - a->tcfa_action = TC_ACT_UNSPEC; + tcf_action_destroy_1(a, bind); + NL_SET_ERR_MSG(extack, "Invalid control action value"); + return ERR_PTR(-EINVAL); } return a; From 25a8238f4cc8425d4aade4f9041be468d0e8aa2e Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 29 Aug 2018 10:22:34 +0200 Subject: [PATCH 46/91] tc-testing: add test-cases for numeric and invalid control action Only the police action allows us to specify an arbitrary numeric value for the control action. This change introduces an explicit test case for the above feature and then leverage it for testing the kernel behavior for invalid control actions (reject). Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- .../tc-testing/tc-tests/actions/police.json | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/police.json b/tools/testing/selftests/tc-testing/tc-tests/actions/police.json index f03763d81617..30f9b54bd666 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/police.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/police.json @@ -312,6 +312,54 @@ "$TC actions flush action police" ] }, + { + "id": "6aaf", + "name": "Add police actions with conform-exceed control pass/pipe [with numeric values]", + "category": [ + "actions", + "police" + ], + "setup": [ + [ + "$TC actions flush action police", + 0, + 1, + 255 + ] + ], + "cmdUnderTest": "$TC actions add action police rate 3mbit burst 250k conform-exceed 0/3 index 1", + "expExitCode": "0", + "verifyCmd": "$TC actions get action police index 1", + "matchPattern": "action order [0-9]*: police 0x1 rate 3Mbit burst 250Kb mtu 2Kb action pass/pipe", + "matchCount": "1", + "teardown": [ + "$TC actions flush action police" + ] + }, + { + "id": "29b1", + "name": "Add police actions with conform-exceed control /drop", + "category": [ + "actions", + "police" + ], + "setup": [ + [ + "$TC actions flush action police", + 0, + 1, + 255 + ] + ], + "cmdUnderTest": "$TC actions add action police rate 3mbit burst 250k conform-exceed 10/drop index 1", + "expExitCode": "255", + "verifyCmd": "$TC actions ls action police", + "matchPattern": "action order [0-9]*: police 0x1 rate 3Mbit burst 250Kb mtu 2Kb action ", + "matchCount": "0", + "teardown": [ + "$TC actions flush action police" + ] + }, { "id": "c26f", "name": "Add police action with invalid peakrate value", From 4f0223bfe9c3e62d8f45a85f1ef1b18a8a263ef9 Mon Sep 17 00:00:00 2001 From: Arunk Khandavalli Date: Thu, 30 Aug 2018 00:40:16 +0300 Subject: [PATCH 47/91] cfg80211: nl80211_update_ft_ies() to validate NL80211_ATTR_IE nl80211_update_ft_ies() tried to validate NL80211_ATTR_IE with is_valid_ie_attr() before dereferencing it, but that helper function returns true in case of NULL pointer (i.e., attribute not included). This can result to dereferencing a NULL pointer. Fix that by explicitly checking that NL80211_ATTR_IE is included. Fixes: 355199e02b83 ("cfg80211: Extend support for IEEE 802.11r Fast BSS Transition") Signed-off-by: Arunk Khandavalli Signed-off-by: Jouni Malinen Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index ce0149a86c13..733ccf867972 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -12099,6 +12099,7 @@ static int nl80211_update_ft_ies(struct sk_buff *skb, struct genl_info *info) return -EOPNOTSUPP; if (!info->attrs[NL80211_ATTR_MDID] || + !info->attrs[NL80211_ATTR_IE] || !is_valid_ie_attr(info->attrs[NL80211_ATTR_IE])) return -EINVAL; From 1eb507903665442360a959136dfa3234c43db085 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Wed, 29 Aug 2018 21:03:25 +0200 Subject: [PATCH 48/91] mac80211: do not convert to A-MSDU if frag/subframe limited Do not start to aggregate packets in a A-MSDU frame (converting the first subframe to A-MSDU, adding the header) if max_tx_fragments or max_amsdu_subframes limits are already exceeded by it. In particular, this happens when drivers set the limit to 1 to avoid A-MSDUs at all. Signed-off-by: Lorenzo Bianconi [reword commit message to be more precise] Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 667a73d6eb5c..1aac5e3c7eee 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -3208,9 +3208,6 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata, if (skb->len + head->len > max_amsdu_len) goto out; - if (!ieee80211_amsdu_prepare_head(sdata, fast_tx, head)) - goto out; - nfrags = 1 + skb_shinfo(skb)->nr_frags; nfrags += 1 + skb_shinfo(head)->nr_frags; frag_tail = &skb_shinfo(head)->frag_list; @@ -3226,6 +3223,9 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata, if (max_frags && nfrags > max_frags) goto out; + if (!ieee80211_amsdu_prepare_head(sdata, fast_tx, head)) + goto out; + /* * Pad out the previous subframe to a multiple of 4 by adding the * padding to the next one, that's being added. Note that head->len From aa58acf325b4aadeecae2bfc90658273b47dbace Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 30 Aug 2018 10:55:49 +0200 Subject: [PATCH 49/91] mac80211: always account for A-MSDU header changes In the error path of changing the SKB headroom of the second A-MSDU subframe, we would not account for the already-changed length of the first frame that just got converted to be in A-MSDU format and thus is a bit longer now. Fix this by doing the necessary accounting. It would be possible to reorder the operations, but that would make the code more complex (to calculate the necessary pad), and the headroom expansion should not fail frequently enough to make that worthwhile. Fixes: 6e0456b54545 ("mac80211: add A-MSDU tx support") Signed-off-by: Johannes Berg Acked-by: Lorenzo Bianconi Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 1aac5e3c7eee..6ca0865de945 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -3239,7 +3239,7 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata, if (!ieee80211_amsdu_realloc_pad(local, skb, sizeof(rfc1042_header) + 2 + pad)) - goto out; + goto out_recalc; ret = true; data = skb_push(skb, ETH_ALEN + 2); @@ -3256,11 +3256,13 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata, head->data_len += skb->len; *frag_tail = skb; - flow->backlog += head->len - orig_len; - tin->backlog_bytes += head->len - orig_len; - - fq_recalc_backlog(fq, tin, flow); +out_recalc: + if (head->len != orig_len) { + flow->backlog += head->len - orig_len; + tin->backlog_bytes += head->len - orig_len; + fq_recalc_backlog(fq, tin, flow); + } out: spin_unlock_bh(&fq->lock); From 3a7ad0634f0986d807772ba74f66f7c3a73612e5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 29 Aug 2018 11:50:12 -0700 Subject: [PATCH 50/91] Revert "packet: switch kvzalloc to allocate memory" This reverts commit 71e41286203c017d24f041a7cd71abea7ca7b1e0. mmap()/munmap() can not be backed by kmalloced pages : We fault in : VM_BUG_ON_PAGE(PageSlab(page), page); unmap_single_vma+0x8a/0x110 unmap_vmas+0x4b/0x90 unmap_region+0xc9/0x140 do_munmap+0x274/0x360 vm_munmap+0x81/0xc0 SyS_munmap+0x2b/0x40 do_syscall_64+0x13e/0x1c0 entry_SYSCALL_64_after_hwframe+0x42/0xb7 Fixes: 71e41286203c ("packet: switch kvzalloc to allocate memory") Signed-off-by: Eric Dumazet Reported-by: John Sperbeck Bisected-by: John Sperbeck Cc: Zhang Yu Cc: Li RongQing Signed-off-by: David S. Miller --- net/packet/af_packet.c | 44 +++++++++++++++++++++++++++++------------- net/packet/internal.h | 1 + 2 files changed, 32 insertions(+), 13 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 5610061e7f2e..75c92a87e7b2 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -4137,36 +4137,52 @@ static const struct vm_operations_struct packet_mmap_ops = { .close = packet_mm_close, }; -static void free_pg_vec(struct pgv *pg_vec, unsigned int len) +static void free_pg_vec(struct pgv *pg_vec, unsigned int order, + unsigned int len) { int i; for (i = 0; i < len; i++) { if (likely(pg_vec[i].buffer)) { - kvfree(pg_vec[i].buffer); + if (is_vmalloc_addr(pg_vec[i].buffer)) + vfree(pg_vec[i].buffer); + else + free_pages((unsigned long)pg_vec[i].buffer, + order); pg_vec[i].buffer = NULL; } } kfree(pg_vec); } -static char *alloc_one_pg_vec_page(unsigned long size) +static char *alloc_one_pg_vec_page(unsigned long order) { char *buffer; + gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | + __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY; - buffer = kvzalloc(size, GFP_KERNEL); + buffer = (char *) __get_free_pages(gfp_flags, order); if (buffer) return buffer; - buffer = kvzalloc(size, GFP_KERNEL | __GFP_RETRY_MAYFAIL); + /* __get_free_pages failed, fall back to vmalloc */ + buffer = vzalloc(array_size((1 << order), PAGE_SIZE)); + if (buffer) + return buffer; - return buffer; + /* vmalloc failed, lets dig into swap here */ + gfp_flags &= ~__GFP_NORETRY; + buffer = (char *) __get_free_pages(gfp_flags, order); + if (buffer) + return buffer; + + /* complete and utter failure */ + return NULL; } -static struct pgv *alloc_pg_vec(struct tpacket_req *req) +static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order) { unsigned int block_nr = req->tp_block_nr; - unsigned long size = req->tp_block_size; struct pgv *pg_vec; int i; @@ -4175,7 +4191,7 @@ static struct pgv *alloc_pg_vec(struct tpacket_req *req) goto out; for (i = 0; i < block_nr; i++) { - pg_vec[i].buffer = alloc_one_pg_vec_page(size); + pg_vec[i].buffer = alloc_one_pg_vec_page(order); if (unlikely(!pg_vec[i].buffer)) goto out_free_pgvec; } @@ -4184,7 +4200,7 @@ static struct pgv *alloc_pg_vec(struct tpacket_req *req) return pg_vec; out_free_pgvec: - free_pg_vec(pg_vec, block_nr); + free_pg_vec(pg_vec, order, block_nr); pg_vec = NULL; goto out; } @@ -4194,9 +4210,9 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, { struct pgv *pg_vec = NULL; struct packet_sock *po = pkt_sk(sk); + int was_running, order = 0; struct packet_ring_buffer *rb; struct sk_buff_head *rb_queue; - int was_running; __be16 num; int err = -EINVAL; /* Added to avoid minimal code churn */ @@ -4258,7 +4274,8 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, goto out; err = -ENOMEM; - pg_vec = alloc_pg_vec(req); + order = get_order(req->tp_block_size); + pg_vec = alloc_pg_vec(req, order); if (unlikely(!pg_vec)) goto out; switch (po->tp_version) { @@ -4312,6 +4329,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, rb->frame_size = req->tp_frame_size; spin_unlock_bh(&rb_queue->lock); + swap(rb->pg_vec_order, order); swap(rb->pg_vec_len, req->tp_block_nr); rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE; @@ -4337,7 +4355,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, } if (pg_vec) - free_pg_vec(pg_vec, req->tp_block_nr); + free_pg_vec(pg_vec, order, req->tp_block_nr); out: return err; } diff --git a/net/packet/internal.h b/net/packet/internal.h index 8f50036f62f0..3bb7c5fb3bff 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -64,6 +64,7 @@ struct packet_ring_buffer { unsigned int frame_size; unsigned int frame_max; + unsigned int pg_vec_order; unsigned int pg_vec_pages; unsigned int pg_vec_len; From 9ad716b95fd6c6be46a4f2d5936e514b5bcd744d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 29 Aug 2018 12:46:08 -0700 Subject: [PATCH 51/91] nfp: wait for posted reconfigs when disabling the device To avoid leaking a running timer we need to wait for the posted reconfigs after netdev is unregistered. In common case the process of deinitializing the device will perform synchronous reconfigs which wait for posted requests, but especially with VXLAN ports being actively added and removed there can be a race condition leaving a timer running after adapter structure is freed leading to a crash. Add an explicit flush after deregistering and for a good measure a warning to check if timer is running just before structures are freed. Fixes: 3d780b926a12 ("nfp: add async reconfiguration mechanism") Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Signed-off-by: David S. Miller --- .../ethernet/netronome/nfp/nfp_net_common.c | 66 ++++++++++++------- 1 file changed, 42 insertions(+), 24 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index a8b9fbab5f73..253bdaef1505 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -229,6 +229,45 @@ static void nfp_net_reconfig_post(struct nfp_net *nn, u32 update) spin_unlock_bh(&nn->reconfig_lock); } +static void nfp_net_reconfig_sync_enter(struct nfp_net *nn) +{ + bool cancelled_timer = false; + u32 pre_posted_requests; + + spin_lock_bh(&nn->reconfig_lock); + + nn->reconfig_sync_present = true; + + if (nn->reconfig_timer_active) { + nn->reconfig_timer_active = false; + cancelled_timer = true; + } + pre_posted_requests = nn->reconfig_posted; + nn->reconfig_posted = 0; + + spin_unlock_bh(&nn->reconfig_lock); + + if (cancelled_timer) { + del_timer_sync(&nn->reconfig_timer); + nfp_net_reconfig_wait(nn, nn->reconfig_timer.expires); + } + + /* Run the posted reconfigs which were issued before we started */ + if (pre_posted_requests) { + nfp_net_reconfig_start(nn, pre_posted_requests); + nfp_net_reconfig_wait(nn, jiffies + HZ * NFP_NET_POLL_TIMEOUT); + } +} + +static void nfp_net_reconfig_wait_posted(struct nfp_net *nn) +{ + nfp_net_reconfig_sync_enter(nn); + + spin_lock_bh(&nn->reconfig_lock); + nn->reconfig_sync_present = false; + spin_unlock_bh(&nn->reconfig_lock); +} + /** * nfp_net_reconfig() - Reconfigure the firmware * @nn: NFP Net device to reconfigure @@ -242,32 +281,9 @@ static void nfp_net_reconfig_post(struct nfp_net *nn, u32 update) */ int nfp_net_reconfig(struct nfp_net *nn, u32 update) { - bool cancelled_timer = false; - u32 pre_posted_requests; int ret; - spin_lock_bh(&nn->reconfig_lock); - - nn->reconfig_sync_present = true; - - if (nn->reconfig_timer_active) { - del_timer(&nn->reconfig_timer); - nn->reconfig_timer_active = false; - cancelled_timer = true; - } - pre_posted_requests = nn->reconfig_posted; - nn->reconfig_posted = 0; - - spin_unlock_bh(&nn->reconfig_lock); - - if (cancelled_timer) - nfp_net_reconfig_wait(nn, nn->reconfig_timer.expires); - - /* Run the posted reconfigs which were issued before we started */ - if (pre_posted_requests) { - nfp_net_reconfig_start(nn, pre_posted_requests); - nfp_net_reconfig_wait(nn, jiffies + HZ * NFP_NET_POLL_TIMEOUT); - } + nfp_net_reconfig_sync_enter(nn); nfp_net_reconfig_start(nn, update); ret = nfp_net_reconfig_wait(nn, jiffies + HZ * NFP_NET_POLL_TIMEOUT); @@ -3633,6 +3649,7 @@ struct nfp_net *nfp_net_alloc(struct pci_dev *pdev, bool needs_netdev, */ void nfp_net_free(struct nfp_net *nn) { + WARN_ON(timer_pending(&nn->reconfig_timer) || nn->reconfig_posted); if (nn->dp.netdev) free_netdev(nn->dp.netdev); else @@ -3920,4 +3937,5 @@ void nfp_net_clean(struct nfp_net *nn) return; unregister_netdev(nn->dp.netdev); + nfp_net_reconfig_wait_posted(nn); } From e04e7a7bbd4bbabef4e1a58367e5fc9b2edc3b10 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Thu, 30 Aug 2018 05:42:13 +0000 Subject: [PATCH 52/91] hv_netvsc: Fix a deadlock by getting rtnl lock earlier in netvsc_probe() This patch fixes the race between netvsc_probe() and rndis_set_subchannel(), which can cause a deadlock. These are the related 3 paths which show the deadlock: path #1: Workqueue: hv_vmbus_con vmbus_onmessage_work [hv_vmbus] Call Trace: schedule schedule_preempt_disabled __mutex_lock __device_attach bus_probe_device device_add vmbus_device_register vmbus_onoffer vmbus_onmessage_work process_one_work worker_thread kthread ret_from_fork path #2: schedule schedule_preempt_disabled __mutex_lock netvsc_probe vmbus_probe really_probe __driver_attach bus_for_each_dev driver_attach_async async_run_entry_fn process_one_work worker_thread kthread ret_from_fork path #3: Workqueue: events netvsc_subchan_work [hv_netvsc] Call Trace: schedule rndis_set_subchannel netvsc_subchan_work process_one_work worker_thread kthread ret_from_fork Before path #1 finishes, path #2 can start to run, because just before the "bus_probe_device(dev);" in device_add() in path #1, there is a line "object_uevent(&dev->kobj, KOBJ_ADD);", so systemd-udevd can immediately try to load hv_netvsc and hence path #2 can start to run. Next, path #2 offloads the subchannal's initialization to a workqueue, i.e. path #3, so we can end up in a deadlock situation like this: Path #2 gets the device lock, and is trying to get the rtnl lock; Path #3 gets the rtnl lock and is waiting for all the subchannel messages to be processed; Path #1 is trying to get the device lock, but since #2 is not releasing the device lock, path #1 has to sleep; since the VMBus messages are processed one by one, this means the sub-channel messages can't be procedded, so #3 has to sleep with the rtnl lock held, and finally #2 has to sleep... Now all the 3 paths are sleeping and we hit the deadlock. With the patch, we can make sure #2 gets both the device lock and the rtnl lock together, gets its job done, and releases the locks, so #1 and #3 will not be blocked for ever. Fixes: 8195b1396ec8 ("hv_netvsc: fix deadlock on hotplug") Signed-off-by: Dexuan Cui Cc: Stephen Hemminger Cc: K. Y. Srinivasan Cc: Haiyang Zhang Signed-off-by: David S. Miller --- drivers/net/hyperv/netvsc_drv.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 1121a1ec407c..70921bbe0e28 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -2206,6 +2206,16 @@ static int netvsc_probe(struct hv_device *dev, memcpy(net->dev_addr, device_info.mac_adr, ETH_ALEN); + /* We must get rtnl lock before scheduling nvdev->subchan_work, + * otherwise netvsc_subchan_work() can get rtnl lock first and wait + * all subchannels to show up, but that may not happen because + * netvsc_probe() can't get rtnl lock and as a result vmbus_onoffer() + * -> ... -> device_add() -> ... -> __device_attach() can't get + * the device lock, so all the subchannels can't be processed -- + * finally netvsc_subchan_work() hangs for ever. + */ + rtnl_lock(); + if (nvdev->num_chn > 1) schedule_work(&nvdev->subchan_work); @@ -2224,7 +2234,6 @@ static int netvsc_probe(struct hv_device *dev, else net->max_mtu = ETH_DATA_LEN; - rtnl_lock(); ret = register_netdevice(net); if (ret != 0) { pr_err("Unable to register netdev.\n"); From b0e0b0abbd5e52568e3848b0ba54a9efa3a11547 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Thu, 30 Aug 2018 13:30:03 +0200 Subject: [PATCH 53/91] net/rds: RDS is not Radio Data System Getting prompt "The RDS Protocol" (RDS) is not too helpful, and it is easily confused with Radio Data System (which we may want to support in kernel, too). Signed-off-by: Pavel Machek Acked-by: Sowmini Varadhan Acked-by: Santosh Shilimkar Acked-by: Sowmini Varadhan Signed-off-by: David S. Miller --- net/rds/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/rds/Kconfig b/net/rds/Kconfig index 01b3bd6a3708..b9092111bc45 100644 --- a/net/rds/Kconfig +++ b/net/rds/Kconfig @@ -1,6 +1,6 @@ config RDS - tristate "The RDS Protocol" + tristate "The Reliable Datagram Sockets Protocol" depends on INET ---help--- The RDS (Reliable Datagram Sockets) protocol provides reliable, From 63cc357f7bba6729869565a12df08441a5995d9a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 30 Aug 2018 14:24:29 +0200 Subject: [PATCH 54/91] tcp: do not restart timewait timer on rst reception RFC 1337 says: ''Ignore RST segments in TIME-WAIT state. If the 2 minute MSL is enforced, this fix avoids all three hazards.'' So with net.ipv4.tcp_rfc1337=1, expected behaviour is to have TIME-WAIT sk expire rather than removing it instantly when a reset is received. However, Linux will also re-start the TIME-WAIT timer. This causes connect to fail when tying to re-use ports or very long delays (until syn retry interval exceeds MSL). packetdrill test case: // Demonstrate bogus rearming of TIME-WAIT timer in rfc1337 mode. `sysctl net.ipv4.tcp_rfc1337=1` 0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 0.000 bind(3, ..., ...) = 0 0.000 listen(3, 1) = 0 0.100 < S 0:0(0) win 29200 0.100 > S. 0:0(0) ack 1 0.200 < . 1:1(0) ack 1 win 257 0.200 accept(3, ..., ...) = 4 // Receive first segment 0.310 < P. 1:1001(1000) ack 1 win 46 // Send one ACK 0.310 > . 1:1(0) ack 1001 // read 1000 byte 0.310 read(4, ..., 1000) = 1000 // Application writes 100 bytes 0.350 write(4, ..., 100) = 100 0.350 > P. 1:101(100) ack 1001 // ACK 0.500 < . 1001:1001(0) ack 101 win 257 // close the connection 0.600 close(4) = 0 0.600 > F. 101:101(0) ack 1001 win 244 // Our side is in FIN_WAIT_1 & waits for ack to fin 0.7 < . 1001:1001(0) ack 102 win 244 // Our side is in FIN_WAIT_2 with no outstanding data. 0.8 < F. 1001:1001(0) ack 102 win 244 0.8 > . 102:102(0) ack 1002 win 244 // Our side is now in TIME_WAIT state, send ack for fin. 0.9 < F. 1002:1002(0) ack 102 win 244 0.9 > . 102:102(0) ack 1002 win 244 // Peer reopens with in-window SYN: 1.000 < S 1000:1000(0) win 9200 // Therefore, reply with ACK. 1.000 > . 102:102(0) ack 1002 win 244 // Peer sends RST for this ACK. Normally this RST results // in tw socket removal, but rfc1337=1 setting prevents this. 1.100 < R 1002:1002(0) win 244 // second syn. Due to rfc1337=1 expect another pure ACK. 31.0 < S 1000:1000(0) win 9200 31.0 > . 102:102(0) ack 1002 win 244 // .. and another RST from peer. 31.1 < R 1002:1002(0) win 244 31.2 `echo no timer restart;ss -m -e -a -i -n -t -o state TIME-WAIT` // third syn after one minute. Time-Wait socket should have expired by now. 63.0 < S 1000:1000(0) win 9200 // so we expect a syn-ack & 3whs to proceed from here on. 63.0 > S. 0:0(0) ack 1 Without this patch, 'ss' shows restarts of tw timer and last packet is thus just another pure ack, more than one minute later. This restores the original code from commit 283fd6cf0be690a83 ("Merge in ANK networking jumbo patch") in netdev-vger-cvs.git . For some reason the else branch was removed/lost in 1f28b683339f7 ("Merge in TCP/UDP optimizations and [..]") and timer restart became unconditional. Reported-by: Michal Tesar Signed-off-by: Florian Westphal Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_minisocks.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 75ef332a7caf..12affb7864d9 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -184,8 +184,9 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, inet_twsk_deschedule_put(tw); return TCP_TW_SUCCESS; } + } else { + inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); } - inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); if (tmp_opt.saw_tstamp) { tcptw->tw_ts_recent = tmp_opt.rcv_tsval; From 902b5417f28d955cdb4898df6ffaab15f56c5cff Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Thu, 30 Aug 2018 16:01:17 +0200 Subject: [PATCH 55/91] selftests: pmtu: maximum MTU for vti4 is 2^16-1-20 Since commit 82612de1c98e ("ip_tunnel: restore binding to ifaces with a large mtu"), the maximum MTU for vti4 is based on IP_MAX_MTU instead of the mysterious constant 0xFFF8. This makes this selftest fail. Fixes: 82612de1c98e ("ip_tunnel: restore binding to ifaces with a large mtu") Signed-off-by: Sabrina Dubroca Acked-by: Stefano Brivio Acked-by: Nicolas Dichtel Signed-off-by: David S. Miller --- tools/testing/selftests/net/pmtu.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh index f8cc38afffa2..0ecf2609b9a4 100755 --- a/tools/testing/selftests/net/pmtu.sh +++ b/tools/testing/selftests/net/pmtu.sh @@ -334,7 +334,7 @@ test_pmtu_vti4_link_add_mtu() { fail=0 min=68 - max=$((65528 - 20)) + max=$((65535 - 20)) # Check invalid values first for v in $((min - 1)) $((max + 1)); do ${ns_a} ip link add vti4_a mtu ${v} type vti local ${veth4_a_addr} remote ${veth4_b_addr} key 10 2>/dev/null From c81c7012e0c769b5704c2b07bd5224965e76fb70 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Thu, 30 Aug 2018 16:01:18 +0200 Subject: [PATCH 56/91] selftests: pmtu: detect correct binary to ping ipv6 addresses Some systems don't have the ping6 binary anymore, and use ping for everything. Detect the absence of ping6 and try to use ping instead. Fixes: d1f1b9cbf34c ("selftests: net: Introduce first PMTU test") Signed-off-by: Sabrina Dubroca Acked-by: Stefano Brivio Signed-off-by: David S. Miller --- tools/testing/selftests/net/pmtu.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh index 0ecf2609b9a4..32a194e3e07a 100755 --- a/tools/testing/selftests/net/pmtu.sh +++ b/tools/testing/selftests/net/pmtu.sh @@ -46,6 +46,9 @@ # Kselftest framework requirement - SKIP code is 4. ksft_skip=4 +# Some systems don't have a ping6 binary anymore +which ping6 > /dev/null 2>&1 && ping6=$(which ping6) || ping6=$(which ping) + tests=" pmtu_vti6_exception vti6: PMTU exceptions pmtu_vti4_exception vti4: PMTU exceptions @@ -274,7 +277,7 @@ test_pmtu_vti6_exception() { mtu "${ns_b}" veth_b 4000 mtu "${ns_a}" vti6_a 5000 mtu "${ns_b}" vti6_b 5000 - ${ns_a} ping6 -q -i 0.1 -w 2 -s 60000 ${vti6_b_addr} > /dev/null + ${ns_a} ${ping6} -q -i 0.1 -w 2 -s 60000 ${vti6_b_addr} > /dev/null # Check that exception was created if [ "$(route_get_dst_pmtu_from_exception "${ns_a}" ${vti6_b_addr})" = "" ]; then From f611a5b4a51fa36a0aa792be474f5d6aacaef7e3 Mon Sep 17 00:00:00 2001 From: Thomas Falcon Date: Thu, 30 Aug 2018 13:19:53 -0500 Subject: [PATCH 57/91] ibmvnic: Include missing return code checks in reset function Check the return codes of these functions and halt reset in case of failure. The driver will remain in a dormant state until the next reset event, when device initialization will be re-attempted. Signed-off-by: Thomas Falcon Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index dafdd4ade705..4f0daf67b18d 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -1823,11 +1823,17 @@ static int do_reset(struct ibmvnic_adapter *adapter, adapter->map_id = 1; release_rx_pools(adapter); release_tx_pools(adapter); - init_rx_pools(netdev); - init_tx_pools(netdev); + rc = init_rx_pools(netdev); + if (rc) + return rc; + rc = init_tx_pools(netdev); + if (rc) + return rc; release_napi(adapter); - init_napi(adapter); + rc = init_napi(adapter); + if (rc) + return rc; } else { rc = reset_tx_pools(adapter); if (rc) From 93bbadd6e0a2a58e49d265b9b1aa58e621b60a26 Mon Sep 17 00:00:00 2001 From: Alexey Kodanev Date: Thu, 30 Aug 2018 19:11:24 +0300 Subject: [PATCH 58/91] ipv6: don't get lwtstate twice in ip6_rt_copy_init() Commit 80f1a0f4e0cd ("net/ipv6: Put lwtstate when destroying fib6_info") partially fixed the kmemleak [1], lwtstate can be copied from fib6_info, with ip6_rt_copy_init(), and it should be done only once there. rt->dst.lwtstate is set by ip6_rt_init_dst(), at the start of the function ip6_rt_copy_init(), so there is no need to get it again at the end. With this patch, lwtstate also isn't copied from RTF_REJECT routes. [1]: unreferenced object 0xffff880b6aaa14e0 (size 64): comm "ip", pid 10577, jiffies 4295149341 (age 1273.903s) hex dump (first 32 bytes): 01 00 04 00 04 00 00 00 10 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<0000000018664623>] lwtunnel_build_state+0x1bc/0x420 [<00000000b73aa29a>] ip6_route_info_create+0x9f7/0x1fd0 [<00000000ee2c5d1f>] ip6_route_add+0x14/0x70 [<000000008537b55c>] inet6_rtm_newroute+0xd9/0xe0 [<000000002acc50f5>] rtnetlink_rcv_msg+0x66f/0x8e0 [<000000008d9cd381>] netlink_rcv_skb+0x268/0x3b0 [<000000004c893c76>] netlink_unicast+0x417/0x5a0 [<00000000f2ab1afb>] netlink_sendmsg+0x70b/0xc30 [<00000000890ff0aa>] sock_sendmsg+0xb1/0xf0 [<00000000a2e7b66f>] ___sys_sendmsg+0x659/0x950 [<000000001e7426c8>] __sys_sendmsg+0xde/0x170 [<00000000fe411443>] do_syscall_64+0x9f/0x4a0 [<000000001be7b28b>] entry_SYSCALL_64_after_hwframe+0x49/0xbe [<000000006d21f353>] 0xffffffffffffffff Fixes: 6edb3c96a5f0 ("net/ipv6: Defer initialization of dst to data path") Signed-off-by: Alexey Kodanev Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index c4ea13e8360b..18e00ce1719a 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -996,7 +996,6 @@ static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort) rt->rt6i_src = ort->fib6_src; #endif rt->rt6i_prefsrc = ort->fib6_prefsrc; - rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate); } static struct fib6_node* fib6_backtrack(struct fib6_node *fn, From 9db39f4d4f94b61e4b64b077f6ddb2bdfb533a88 Mon Sep 17 00:00:00 2001 From: Tushar Dave Date: Fri, 31 Aug 2018 23:45:16 +0200 Subject: [PATCH 59/91] bpf: Fix bpf_msg_pull_data() Helper bpf_msg_pull_data() mistakenly reuses variable 'offset' while linearizing multiple scatterlist elements. Variable 'offset' is used to find first starting scatterlist element i.e. msg->data = sg_virt(&sg[first_sg]) + start - offset" Use different variable name while linearizing multiple scatterlist elements so that value contained in variable 'offset' won't get overwritten. Fixes: 015632bb30da ("bpf: sk_msg program helper bpf_sk_msg_pull_data") Signed-off-by: Tushar Dave Signed-off-by: Daniel Borkmann --- net/core/filter.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 2c7801f6737a..aecdeba052d3 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2292,7 +2292,7 @@ static const struct bpf_func_proto bpf_msg_cork_bytes_proto = { BPF_CALL_4(bpf_msg_pull_data, struct sk_msg_buff *, msg, u32, start, u32, end, u64, flags) { - unsigned int len = 0, offset = 0, copy = 0; + unsigned int len = 0, offset = 0, copy = 0, poffset = 0; int bytes = end - start, bytes_sg_total; struct scatterlist *sg = msg->sg_data; int first_sg, last_sg, i, shift; @@ -2348,16 +2348,15 @@ BPF_CALL_4(bpf_msg_pull_data, if (unlikely(!page)) return -ENOMEM; p = page_address(page); - offset = 0; i = first_sg; do { from = sg_virt(&sg[i]); len = sg[i].length; - to = p + offset; + to = p + poffset; memcpy(to, from, len); - offset += len; + poffset += len; sg[i].length = 0; put_page(sg_page(&sg[i])); From 97911e0ccb54c7478b9dac77e6c54c01b449e3a7 Mon Sep 17 00:00:00 2001 From: Prashant Bhole Date: Fri, 31 Aug 2018 15:32:42 +0900 Subject: [PATCH 60/91] tools/bpf: bpftool, add xskmap in map types When listed all maps, bpftool currently shows (null) for xskmap. Added xskmap type in map_type_name[] to show correct type. Signed-off-by: Prashant Bhole Acked-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- tools/bpf/bpftool/map.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index b2ec20e562bd..b455930a3eaf 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -68,6 +68,7 @@ static const char * const map_type_name[] = { [BPF_MAP_TYPE_DEVMAP] = "devmap", [BPF_MAP_TYPE_SOCKMAP] = "sockmap", [BPF_MAP_TYPE_CPUMAP] = "cpumap", + [BPF_MAP_TYPE_XSKMAP] = "xskmap", [BPF_MAP_TYPE_SOCKHASH] = "sockhash", [BPF_MAP_TYPE_CGROUP_STORAGE] = "cgroup_storage", }; From 597222f72a94118f593e4f32bf58ae7e049a0df1 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 30 Aug 2018 21:25:02 -0700 Subject: [PATCH 61/91] bpf: avoid misuse of psock when TCP_ULP_BPF collides with another ULP Currently we check sk_user_data is non NULL to determine if the sk exists in a map. However, this is not sufficient to ensure the psock or the ULP ops are not in use by another user, such as kcm or TLS. To avoid this when adding a sock to a map also verify it is of the correct ULP type. Additionally, when releasing a psock verify that it is the TCP_ULP_BPF type before releasing the ULP. The error case where we abort an update due to ULP collision can cause this error path. For example, __sock_map_ctx_update_elem() [...] err = tcp_set_ulp_id(sock, TCP_ULP_BPF) <- collides with TLS if (err) <- so err out here goto out_free [...] out_free: smap_release_sock() <- calling tcp_cleanup_ulp releases the TLS ULP incorrectly. Fixes: 2f857d04601a ("bpf: sockmap, remove STRPARSER map_flags and add multi-map support") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index ce63e5801746..488ef9663c01 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -1462,10 +1462,16 @@ static void smap_destroy_psock(struct rcu_head *rcu) schedule_work(&psock->gc_work); } +static bool psock_is_smap_sk(struct sock *sk) +{ + return inet_csk(sk)->icsk_ulp_ops == &bpf_tcp_ulp_ops; +} + static void smap_release_sock(struct smap_psock *psock, struct sock *sock) { if (refcount_dec_and_test(&psock->refcnt)) { - tcp_cleanup_ulp(sock); + if (psock_is_smap_sk(sock)) + tcp_cleanup_ulp(sock); write_lock_bh(&sock->sk_callback_lock); smap_stop_sock(psock, sock); write_unlock_bh(&sock->sk_callback_lock); @@ -1892,6 +1898,10 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, * doesn't update user data. */ if (psock) { + if (!psock_is_smap_sk(sock)) { + err = -EBUSY; + goto out_progs; + } if (READ_ONCE(psock->bpf_parse) && parse) { err = -EBUSY; goto out_progs; From 4fb7253e4f9a8f06a986a3b317e2f79d9b43d552 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Wed, 29 Aug 2018 18:06:08 +0800 Subject: [PATCH 62/91] igmp: fix incorrect unsolicit report count when join group We should not start timer if im->unsolicit_count equal to 0 after decrease. Or we will send one more unsolicit report message. i.e. 3 instead of 2 by default. Fixes: 1da177e4c3f41 ("Linux-2.6.12-rc2") Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- net/ipv4/igmp.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index cf75f8944b05..deb1f8274d71 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -820,10 +820,9 @@ static void igmp_timer_expire(struct timer_list *t) spin_lock(&im->lock); im->tm_running = 0; - if (im->unsolicit_count) { - im->unsolicit_count--; + if (im->unsolicit_count && --im->unsolicit_count) igmp_start_timer(im, unsolicited_report_interval(in_dev)); - } + im->reporter = 1; spin_unlock(&im->lock); From ff06525fcb8ae3c302ac1319bf6c07c026dea964 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Wed, 29 Aug 2018 18:06:10 +0800 Subject: [PATCH 63/91] igmp: fix incorrect unsolicit report count after link down and up After link down and up, i.e. when call ip_mc_up(), we doesn't init im->unsolicit_count. So after igmp_timer_expire(), we will not start timer again and only send one unsolicit report at last. Fix it by initializing im->unsolicit_count in igmp_group_added(), so we can respect igmp robustness value. Fixes: 24803f38a5c0b ("igmp: do not remove igmp souce list info when set link down") Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- net/ipv4/igmp.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index deb1f8274d71..4da39446da2d 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1307,6 +1307,8 @@ static void igmp_group_added(struct ip_mc_list *im) if (in_dev->dead) return; + + im->unsolicit_count = net->ipv4.sysctl_igmp_qrv; if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) { spin_lock_bh(&im->lock); igmp_start_timer(im, IGMP_INITIAL_REPORT_DELAY); @@ -1390,9 +1392,6 @@ static void __ip_mc_inc_group(struct in_device *in_dev, __be32 addr, unsigned int mode) { struct ip_mc_list *im; -#ifdef CONFIG_IP_MULTICAST - struct net *net = dev_net(in_dev->dev); -#endif ASSERT_RTNL(); @@ -1419,7 +1418,6 @@ static void __ip_mc_inc_group(struct in_device *in_dev, __be32 addr, spin_lock_init(&im->lock); #ifdef CONFIG_IP_MULTICAST timer_setup(&im->timer, igmp_timer_expire, 0); - im->unsolicit_count = net->ipv4.sysctl_igmp_qrv; #endif im->next_rcu = in_dev->mc_list; From 10d7fac4c52618d94a42d701d28f114147291ecc Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Wed, 29 Aug 2018 08:00:23 -0700 Subject: [PATCH 64/91] dt-bindings: net: cpsw: Document cpsw-phy-sel usage but prefer phandle The current cpsw usage for cpsw-phy-sel is undocumented but is used for all the boards using cpsw. And cpsw-phy-sel is not really a child of the cpsw device, it lives in the system control module instead. Let's document the existing usage, and improve it a bit where we prefer to use a phandle instead of a child device for it. That way we can properly describe the hardware in dts files for things like genpd. Cc: devicetree@vger.kernel.org Cc: Andrew Lunn Cc: Grygorii Strashko Cc: Ivan Khoronzhuk Cc: Mark Rutland Cc: Murali Karicheri Cc: Rob Herring Signed-off-by: Tony Lindgren Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/cpsw.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/devicetree/bindings/net/cpsw.txt b/Documentation/devicetree/bindings/net/cpsw.txt index 41089369f891..b3acebe08eb0 100644 --- a/Documentation/devicetree/bindings/net/cpsw.txt +++ b/Documentation/devicetree/bindings/net/cpsw.txt @@ -19,6 +19,10 @@ Required properties: - slaves : Specifies number for slaves - active_slave : Specifies the slave to use for time stamping, ethtool and SIOCGMIIPHY +- cpsw-phy-sel : Specifies the phandle to the CPSW phy mode selection + device. See also cpsw-phy-sel.txt for it's binding. + Note that in legacy cases cpsw-phy-sel may be + a child device instead of a phandle. Optional properties: - ti,hwmods : Must be "cpgmac0" @@ -75,6 +79,7 @@ Examples: cpts_clock_mult = <0x80000000>; cpts_clock_shift = <29>; syscon = <&cm>; + cpsw-phy-sel = <&phy_sel>; cpsw_emac0: slave@0 { phy_id = <&davinci_mdio>, <0>; phy-mode = "rgmii-txid"; @@ -103,6 +108,7 @@ Examples: cpts_clock_mult = <0x80000000>; cpts_clock_shift = <29>; syscon = <&cm>; + cpsw-phy-sel = <&phy_sel>; cpsw_emac0: slave@0 { phy_id = <&davinci_mdio>, <0>; phy-mode = "rgmii-txid"; From 18eb8aea7fb2fb4490e578b1b8a1096c34b2fc48 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Wed, 29 Aug 2018 08:00:24 -0700 Subject: [PATCH 65/91] net: ethernet: cpsw-phy-sel: prefer phandle for phy sel The cpsw-phy-sel device is not a child of the cpsw interconnect target module. It lives in the system control module. Let's fix this issue by trying to use cpsw-phy-sel phandle first if it exists and if not fall back to current usage of trying to find the cpsw-phy-sel child. That way the phy sel driver can be a child of the system control module where it belongs in the device tree. Without this fix, we cannot have a proper interconnect target module hierarchy in device tree for things like genpd. Note that deferred probe is mostly not supported by cpsw and this patch does not attempt to fix that. In case deferred probe support is needed, this could be added to cpsw_slave_open() and phy_connect() so they start handling and returning errors. For documenting it, looks like the cpsw-phy-sel is used for all cpsw device tree nodes. It's missing the related binding documentation, so let's also update the binding documentation accordingly. Cc: devicetree@vger.kernel.org Cc: Andrew Lunn Cc: Grygorii Strashko Cc: Ivan Khoronzhuk Cc: Mark Rutland Cc: Murali Karicheri Cc: Rob Herring Signed-off-by: Tony Lindgren Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/cpsw-phy-sel.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/ti/cpsw-phy-sel.c b/drivers/net/ethernet/ti/cpsw-phy-sel.c index 0c1adad7415d..396e1cd10667 100644 --- a/drivers/net/ethernet/ti/cpsw-phy-sel.c +++ b/drivers/net/ethernet/ti/cpsw-phy-sel.c @@ -170,10 +170,13 @@ void cpsw_phy_sel(struct device *dev, phy_interface_t phy_mode, int slave) struct device_node *node; struct cpsw_phy_sel_priv *priv; - node = of_get_child_by_name(dev->of_node, "cpsw-phy-sel"); + node = of_parse_phandle(dev->of_node, "cpsw-phy-sel", 0); if (!node) { - dev_err(dev, "Phy mode driver DT not found\n"); - return; + node = of_get_child_by_name(dev->of_node, "cpsw-phy-sel"); + if (!node) { + dev_err(dev, "Phy mode driver DT not found\n"); + return; + } } dev = bus_find_device(&platform_bus_type, NULL, node, match); From 15a81b418e22a9aa4a0504471fdcb0f4ebf69b96 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 30 Aug 2018 14:15:43 -0700 Subject: [PATCH 66/91] net/ipv6: Only update MTU metric if it set Jan reported a regression after an update to 4.18.5. In this case ipv6 default route is setup by systemd-networkd based on data from an RA. The RA contains an MTU of 1492 which is used when the route is first inserted but then systemd-networkd pushes down updates to the default route without the mtu set. Prior to the change to fib6_info, metrics such as MTU were held in the dst_entry and rt6i_pmtu in rt6_info contained an update to the mtu if any. ip6_mtu would look at rt6i_pmtu first and use it if set. If not, the value from the metrics is used if it is set and finally falling back to the idev value. After the fib6_info change metrics are contained in the fib6_info struct and there is no equivalent to rt6i_pmtu. To maintain consistency with the old behavior the new code should only reset the MTU in the metrics if the route update has it set. Fixes: d4ead6b34b67 ("net/ipv6: move metrics from dst to rt6_info") Reported-by: Jan Janssen Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index c861a6d4671d..5516f55e214b 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -989,7 +989,10 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, fib6_clean_expires(iter); else fib6_set_expires(iter, rt->expires); - fib6_metric_set(iter, RTAX_MTU, rt->fib6_pmtu); + + if (rt->fib6_pmtu) + fib6_metric_set(iter, RTAX_MTU, + rt->fib6_pmtu); return -EEXIST; } /* If we have the same destination and the same metric, From 16fe10cf92783ed9ceb182d6ea2b8adf5e8ec1b8 Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Sat, 1 Sep 2018 20:11:05 +0800 Subject: [PATCH 67/91] net: cadence: Fix a sleep-in-atomic-context bug in macb_halt_tx() The kernel module may sleep with holding a spinlock. The function call paths (from bottom to top) in Linux-4.16 are: [FUNC] usleep_range drivers/net/ethernet/cadence/macb_main.c, 648: usleep_range in macb_halt_tx drivers/net/ethernet/cadence/macb_main.c, 730: macb_halt_tx in macb_tx_error_task drivers/net/ethernet/cadence/macb_main.c, 721: _raw_spin_lock_irqsave in macb_tx_error_task To fix this bug, usleep_range() is replaced with udelay(). This bug is found by my static analysis tool DSAC. Signed-off-by: Jia-Ju Bai Signed-off-by: David S. Miller --- drivers/net/ethernet/cadence/macb_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index c6707ea2d751..16e4ef7d7185 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -649,7 +649,7 @@ static int macb_halt_tx(struct macb *bp) if (!(status & MACB_BIT(TGO))) return 0; - usleep_range(10, 250); + udelay(250); } while (time_before(halt_time, timeout)); return -ETIMEDOUT; From 59a03fea131d671a57b8ed3dc446264c61d4b75f Mon Sep 17 00:00:00 2001 From: Vinson Lee Date: Sat, 1 Sep 2018 21:20:27 +0000 Subject: [PATCH 68/91] uapi: Fix linux/rds.h userspace compilation errors. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Include linux/in6.h for struct in6_addr. /usr/include/linux/rds.h:156:18: error: field ‘laddr’ has incomplete type struct in6_addr laddr; ^~~~~ /usr/include/linux/rds.h:157:18: error: field ‘faddr’ has incomplete type struct in6_addr faddr; ^~~~~ /usr/include/linux/rds.h:178:18: error: field ‘laddr’ has incomplete type struct in6_addr laddr; ^~~~~ /usr/include/linux/rds.h:179:18: error: field ‘faddr’ has incomplete type struct in6_addr faddr; ^~~~~ /usr/include/linux/rds.h:198:18: error: field ‘bound_addr’ has incomplete type struct in6_addr bound_addr; ^~~~~~~~~~ /usr/include/linux/rds.h:199:18: error: field ‘connected_addr’ has incomplete type struct in6_addr connected_addr; ^~~~~~~~~~~~~~ /usr/include/linux/rds.h:219:18: error: field ‘local_addr’ has incomplete type struct in6_addr local_addr; ^~~~~~~~~~ /usr/include/linux/rds.h:221:18: error: field ‘peer_addr’ has incomplete type struct in6_addr peer_addr; ^~~~~~~~~ /usr/include/linux/rds.h:245:18: error: field ‘src_addr’ has incomplete type struct in6_addr src_addr; ^~~~~~~~ /usr/include/linux/rds.h:246:18: error: field ‘dst_addr’ has incomplete type struct in6_addr dst_addr; ^~~~~~~~ Fixes: b7ff8b1036f0 ("rds: Extend RDS API for IPv6 support") Signed-off-by: Vinson Lee Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- include/uapi/linux/rds.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/uapi/linux/rds.h b/include/uapi/linux/rds.h index dc520e1a4123..8b73cb603c5f 100644 --- a/include/uapi/linux/rds.h +++ b/include/uapi/linux/rds.h @@ -37,6 +37,7 @@ #include #include /* For __kernel_sockaddr_storage. */ +#include /* For struct in6_addr. */ #define RDS_IB_ABI_VERSION 0x301 From 66eb02d839e8495ae6b612e2d09ff599374b80e2 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Fri, 31 Aug 2018 01:04:13 +0200 Subject: [PATCH 69/91] mac80211: fix an off-by-one issue in A-MSDU max_subframe computation Initialize 'n' to 2 in order to take into account also the first packet in the estimation of max_subframe limit for a given A-MSDU since frag_tail pointer is NULL when ieee80211_amsdu_aggregate routine analyzes the second frame. Fixes: 6e0456b54545 ("mac80211: add A-MSDU tx support") Signed-off-by: Lorenzo Bianconi Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 6ca0865de945..9b3b069e418a 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -3174,7 +3174,7 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata, void *data; bool ret = false; unsigned int orig_len; - int n = 1, nfrags, pad = 0; + int n = 2, nfrags, pad = 0; u16 hdrlen; if (!ieee80211_hw_check(&local->hw, TX_AMSDU)) From 8442938c3a2177ba16043b3a935f2c78266ad399 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 31 Aug 2018 11:10:55 +0300 Subject: [PATCH 70/91] cfg80211: fix a type issue in ieee80211_chandef_to_operating_class() The "chandef->center_freq1" variable is a u32 but "freq" is a u16 so we are truncating away the high bits. I noticed this bug because in commit 9cf0a0b4b64a ("cfg80211: Add support for 60GHz band channels 5 and 6") we made "freq <= 56160 + 2160 * 6" a valid requency when before it was only "freq <= 56160 + 2160 * 4" that was valid. It introduces a static checker warning: net/wireless/util.c:1571 ieee80211_chandef_to_operating_class() warn: always true condition '(freq <= 56160 + 2160 * 6) => (0-u16max <= 69120)' But really we probably shouldn't have been truncating the high bits away to begin with. Signed-off-by: Dan Carpenter Signed-off-by: Johannes Berg --- net/wireless/util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/wireless/util.c b/net/wireless/util.c index 3c654cd7ba56..908bf5b6d89e 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -1374,7 +1374,7 @@ bool ieee80211_chandef_to_operating_class(struct cfg80211_chan_def *chandef, u8 *op_class) { u8 vht_opclass; - u16 freq = chandef->center_freq1; + u32 freq = chandef->center_freq1; if (freq >= 2412 && freq <= 2472) { if (chandef->width > NL80211_CHAN_WIDTH_40) From abd76d255d69d70206c01b9cb19ba36a9c1df6a1 Mon Sep 17 00:00:00 2001 From: "Dreyfuss, Haim" Date: Fri, 31 Aug 2018 11:31:04 +0300 Subject: [PATCH 71/91] mac80211: fix WMM TXOP calculation In commit 9236c4523e5b ("mac80211: limit wmm params to comply with ETSI requirements"), we have limited the WMM parameters to comply with 802.11 and ETSI standard. Mistakenly the TXOP value was caluclated wrong. Fix it by taking the minimum between 802.11 to ETSI to make sure we are not violating both. Fixes: e552af058148 ("mac80211: limit wmm params to comply with ETSI requirements") Signed-off-by: Haim Dreyfuss Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- net/mac80211/util.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/mac80211/util.c b/net/mac80211/util.c index c80187d6e6bb..93b5bb849ad7 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1151,8 +1151,7 @@ void ieee80211_regulatory_limit_wmm_params(struct ieee80211_sub_if_data *sdata, qparam->cw_min = max_t(u16, qparam->cw_min, wmm_ac->cw_min); qparam->cw_max = max_t(u16, qparam->cw_max, wmm_ac->cw_max); qparam->aifs = max_t(u8, qparam->aifs, wmm_ac->aifsn); - qparam->txop = !qparam->txop ? wmm_ac->cot / 32 : - min_t(u16, qparam->txop, wmm_ac->cot / 32); + qparam->txop = min_t(u16, qparam->txop, wmm_ac->cot / 32); rcu_read_unlock(); } From f3ffb6c3a28963657eb8b02a795d75f2ebbd5ef4 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Fri, 31 Aug 2018 11:31:06 +0300 Subject: [PATCH 72/91] mac80211: fix a race between restart and CSA flows We hit a problem with iwlwifi that was caused by a bug in mac80211. A bug in iwlwifi caused the firwmare to crash in certain cases in channel switch. Because of that bug, drv_pre_channel_switch would fail and trigger the restart flow. Now we had the hw restart worker which runs on the system's workqueue and the csa_connection_drop_work worker that runs on mac80211's workqueue that can run together. This is obviously problematic since the restart work wants to reconfigure the connection, while the csa_connection_drop_work worker does the exact opposite: it tries to disconnect. Fix this by cancelling the csa_connection_drop_work worker in the restart worker. Note that this can sound racy: we could have: driver iface_work CSA_work restart_work +++++++++++++++++++++++++++++++++++++++++++++ | <--drv_cs ---| -CS FAILED--> | | | cancel_work(CSA) schedule | CSA work | | | Race between those 2 But this is not possible because we flush the workqueue in the restart worker before we cancel the CSA worker. That would be bullet proof if we could guarantee that we schedule the CSA worker only from the iface_work which runs on the workqueue (and not on the system's workqueue), but unfortunately we do have an instance in which we schedule the CSA work outside the context of the workqueue (ieee80211_chswitch_done). Note also that we should probably cancel other workers like beacon_connection_loss_work and possibly others for different types of interfaces, at the very least, IBSS should suffer from the exact same problem, but for now, do the minimum to fix the actual bug that was actually experienced and reproduced. Signed-off-by: Emmanuel Grumbach Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- net/mac80211/main.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 27cd64acaf00..66cbddd65b47 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -255,8 +255,27 @@ static void ieee80211_restart_work(struct work_struct *work) flush_work(&local->radar_detected_work); rtnl_lock(); - list_for_each_entry(sdata, &local->interfaces, list) + list_for_each_entry(sdata, &local->interfaces, list) { + /* + * XXX: there may be more work for other vif types and even + * for station mode: a good thing would be to run most of + * the iface type's dependent _stop (ieee80211_mg_stop, + * ieee80211_ibss_stop) etc... + * For now, fix only the specific bug that was seen: race + * between csa_connection_drop_work and us. + */ + if (sdata->vif.type == NL80211_IFTYPE_STATION) { + /* + * This worker is scheduled from the iface worker that + * runs on mac80211's workqueue, so we can't be + * scheduling this worker after the cancel right here. + * The exception is ieee80211_chswitch_done. + * Then we can have a race... + */ + cancel_work_sync(&sdata->u.mgd.csa_connection_drop_work); + } flush_delayed_work(&sdata->dec_tailroom_needed_wk); + } ieee80211_scan_cancel(local); /* make sure any new ROC will consider local->in_reconfig */ From 0007e94355fdb71a1cf5dba0754155cba08f0666 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Fri, 31 Aug 2018 11:31:10 +0300 Subject: [PATCH 73/91] mac80211: Fix station bandwidth setting after channel switch When performing a channel switch flow for a managed interface, the flow did not update the bandwidth of the AP station and the rate scale algorithm. In case of a channel width downgrade, this would result with the rate scale algorithm using a bandwidth that does not match the interface channel configuration. Fix this by updating the AP station bandwidth and rate scaling algorithm before the actual channel change in case of a bandwidth downgrade, or after the actual channel change in case of a bandwidth upgrade. Signed-off-by: Ilan Peer Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 53 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index a59187c016e0..22b699460176 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -978,6 +978,10 @@ static void ieee80211_chswitch_work(struct work_struct *work) */ if (sdata->reserved_chanctx) { + struct ieee80211_supported_band *sband = NULL; + struct sta_info *mgd_sta = NULL; + enum ieee80211_sta_rx_bandwidth bw = IEEE80211_STA_RX_BW_20; + /* * with multi-vif csa driver may call ieee80211_csa_finish() * many times while waiting for other interfaces to use their @@ -986,6 +990,48 @@ static void ieee80211_chswitch_work(struct work_struct *work) if (sdata->reserved_ready) goto out; + if (sdata->vif.bss_conf.chandef.width != + sdata->csa_chandef.width) { + /* + * For managed interface, we need to also update the AP + * station bandwidth and align the rate scale algorithm + * on the bandwidth change. Here we only consider the + * bandwidth of the new channel definition (as channel + * switch flow does not have the full HT/VHT/HE + * information), assuming that if additional changes are + * required they would be done as part of the processing + * of the next beacon from the AP. + */ + switch (sdata->csa_chandef.width) { + case NL80211_CHAN_WIDTH_20_NOHT: + case NL80211_CHAN_WIDTH_20: + default: + bw = IEEE80211_STA_RX_BW_20; + break; + case NL80211_CHAN_WIDTH_40: + bw = IEEE80211_STA_RX_BW_40; + break; + case NL80211_CHAN_WIDTH_80: + bw = IEEE80211_STA_RX_BW_80; + break; + case NL80211_CHAN_WIDTH_80P80: + case NL80211_CHAN_WIDTH_160: + bw = IEEE80211_STA_RX_BW_160; + break; + } + + mgd_sta = sta_info_get(sdata, ifmgd->bssid); + sband = + local->hw.wiphy->bands[sdata->csa_chandef.chan->band]; + } + + if (sdata->vif.bss_conf.chandef.width > + sdata->csa_chandef.width) { + mgd_sta->sta.bandwidth = bw; + rate_control_rate_update(local, sband, mgd_sta, + IEEE80211_RC_BW_CHANGED); + } + ret = ieee80211_vif_use_reserved_context(sdata); if (ret) { sdata_info(sdata, @@ -996,6 +1042,13 @@ static void ieee80211_chswitch_work(struct work_struct *work) goto out; } + if (sdata->vif.bss_conf.chandef.width < + sdata->csa_chandef.width) { + mgd_sta->sta.bandwidth = bw; + rate_control_rate_update(local, sband, mgd_sta, + IEEE80211_RC_BW_CHANGED); + } + goto out; } From 6c18b27d6e5c6a7206364eae2b47bc8d8b2fa68f Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Fri, 31 Aug 2018 11:31:12 +0300 Subject: [PATCH 74/91] mac80211: don't Tx a deauth frame if the AP forbade Tx If the driver fails to properly prepare for the channel switch, mac80211 will disconnect. If the CSA IE had mode set to 1, it means that the clients are not allowed to send any Tx on the current channel, and that includes the deauthentication frame. Make sure that we don't send the deauthentication frame in this case. In iwlwifi, this caused a failure to flush queues since the firmware already closed the queues after having parsed the CSA IE. Then mac80211 would wait until the deauthentication frame would go out (drv_flush(drop=false)) and that would never happen. Signed-off-by: Emmanuel Grumbach Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 22b699460176..b046bf95eb3c 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1270,6 +1270,16 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, cbss->beacon_interval)); return; drop_connection: + /* + * This is just so that the disconnect flow will know that + * we were trying to switch channel and failed. In case the + * mode is 1 (we are not allowed to Tx), we will know not to + * send a deauthentication frame. Those two fields will be + * reset when the disconnection worker runs. + */ + sdata->vif.csa_active = true; + sdata->csa_block_tx = csa_ie.mode; + ieee80211_queue_work(&local->hw, &ifmgd->csa_connection_drop_work); mutex_unlock(&local->chanctx_mtx); mutex_unlock(&local->mtx); @@ -2453,6 +2463,7 @@ static void __ieee80211_disconnect(struct ieee80211_sub_if_data *sdata) struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN]; + bool tx; sdata_lock(sdata); if (!ifmgd->associated) { @@ -2460,6 +2471,8 @@ static void __ieee80211_disconnect(struct ieee80211_sub_if_data *sdata) return; } + tx = !sdata->csa_block_tx; + /* AP is probably out of range (or not reachable for another reason) so * remove the bss struct for that AP. */ @@ -2467,7 +2480,7 @@ static void __ieee80211_disconnect(struct ieee80211_sub_if_data *sdata) ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH, WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY, - true, frame_buf); + tx, frame_buf); mutex_lock(&local->mtx); sdata->vif.csa_active = false; ifmgd->csa_waiting_bcn = false; @@ -2478,7 +2491,7 @@ static void __ieee80211_disconnect(struct ieee80211_sub_if_data *sdata) } mutex_unlock(&local->mtx); - ieee80211_report_disconnect(sdata, frame_buf, sizeof(frame_buf), true, + ieee80211_report_disconnect(sdata, frame_buf, sizeof(frame_buf), tx, WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY); sdata_unlock(sdata); From c6e57b3896fc76299913b8cfd82d853bee8a2c84 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Fri, 31 Aug 2018 11:31:13 +0300 Subject: [PATCH 75/91] mac80211: shorten the IBSS debug messages When tracing is enabled, all the debug messages are recorded and must not exceed MAX_MSG_LEN (100) columns. Longer debug messages grant the user with: WARNING: CPU: 3 PID: 32642 at /tmp/wifi-core-20180806094828/src/iwlwifi-stack-dev/net/mac80211/./trace_msg.h:32 trace_event_raw_event_mac80211_msg_event+0xab/0xc0 [mac80211] Workqueue: phy1 ieee80211_iface_work [mac80211] RIP: 0010:trace_event_raw_event_mac80211_msg_event+0xab/0xc0 [mac80211] Call Trace: __sdata_dbg+0xbd/0x120 [mac80211] ieee80211_ibss_rx_queued_mgmt+0x15f/0x510 [mac80211] ieee80211_iface_work+0x21d/0x320 [mac80211] Signed-off-by: Emmanuel Grumbach Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- net/mac80211/ibss.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 6449a1c2283b..f0f5fedb8caa 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -947,8 +947,8 @@ static void ieee80211_rx_mgmt_deauth_ibss(struct ieee80211_sub_if_data *sdata, if (len < IEEE80211_DEAUTH_FRAME_LEN) return; - ibss_dbg(sdata, "RX DeAuth SA=%pM DA=%pM BSSID=%pM (reason: %d)\n", - mgmt->sa, mgmt->da, mgmt->bssid, reason); + ibss_dbg(sdata, "RX DeAuth SA=%pM DA=%pM\n", mgmt->sa, mgmt->da); + ibss_dbg(sdata, "\tBSSID=%pM (reason: %d)\n", mgmt->bssid, reason); sta_info_destroy_addr(sdata, mgmt->sa); } @@ -966,9 +966,9 @@ static void ieee80211_rx_mgmt_auth_ibss(struct ieee80211_sub_if_data *sdata, auth_alg = le16_to_cpu(mgmt->u.auth.auth_alg); auth_transaction = le16_to_cpu(mgmt->u.auth.auth_transaction); - ibss_dbg(sdata, - "RX Auth SA=%pM DA=%pM BSSID=%pM (auth_transaction=%d)\n", - mgmt->sa, mgmt->da, mgmt->bssid, auth_transaction); + ibss_dbg(sdata, "RX Auth SA=%pM DA=%pM\n", mgmt->sa, mgmt->da); + ibss_dbg(sdata, "\tBSSID=%pM (auth_transaction=%d)\n", + mgmt->bssid, auth_transaction); if (auth_alg != WLAN_AUTH_OPEN || auth_transaction != 1) return; @@ -1175,10 +1175,10 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata, rx_timestamp = drv_get_tsf(local, sdata); } - ibss_dbg(sdata, - "RX beacon SA=%pM BSSID=%pM TSF=0x%llx BCN=0x%llx diff=%lld @%lu\n", + ibss_dbg(sdata, "RX beacon SA=%pM BSSID=%pM TSF=0x%llx\n", mgmt->sa, mgmt->bssid, - (unsigned long long)rx_timestamp, + (unsigned long long)rx_timestamp); + ibss_dbg(sdata, "\tBCN=0x%llx diff=%lld @%lu\n", (unsigned long long)beacon_timestamp, (unsigned long long)(rx_timestamp - beacon_timestamp), jiffies); @@ -1537,9 +1537,9 @@ static void ieee80211_rx_mgmt_probe_req(struct ieee80211_sub_if_data *sdata, tx_last_beacon = drv_tx_last_beacon(local); - ibss_dbg(sdata, - "RX ProbeReq SA=%pM DA=%pM BSSID=%pM (tx_last_beacon=%d)\n", - mgmt->sa, mgmt->da, mgmt->bssid, tx_last_beacon); + ibss_dbg(sdata, "RX ProbeReq SA=%pM DA=%pM\n", mgmt->sa, mgmt->da); + ibss_dbg(sdata, "\tBSSID=%pM (tx_last_beacon=%d)\n", + mgmt->bssid, tx_last_beacon); if (!tx_last_beacon && is_multicast_ether_addr(mgmt->da)) return; From 36feaac35405e932ad9c321d7a2db8a7e4a4d7ca Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 31 Aug 2018 16:52:01 +0800 Subject: [PATCH 76/91] ip6_tunnel: respect ttl inherit for ip6tnl man ip-tunnel ttl section says: 0 is a special value meaning that packets inherit the TTL value. IPv4 tunnel respect this in ip_tunnel_xmit(), but IPv6 tunnel has not implement it yet. To make IPv6 behave consistently with IP tunnel, add ipv6 tunnel inherit support. Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 5df2a58d945c..419960b0ba16 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1188,7 +1188,15 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, init_tel_txopt(&opt, encap_limit); ipv6_push_frag_opts(skb, &opt.ops, &proto); } - hop_limit = hop_limit ? : ip6_dst_hoplimit(dst); + + if (hop_limit == 0) { + if (skb->protocol == htons(ETH_P_IP)) + hop_limit = ip_hdr(skb)->ttl; + else if (skb->protocol == htons(ETH_P_IPV6)) + hop_limit = ipv6_hdr(skb)->hop_limit; + else + hop_limit = ip6_dst_hoplimit(dst); + } /* Calculate max headroom for all the headers and adjust * needed_headroom if necessary. From 9fd0e09a4e86499639653243edfcb417a05c5c46 Mon Sep 17 00:00:00 2001 From: Anthony Wong Date: Fri, 31 Aug 2018 20:06:42 +0800 Subject: [PATCH 77/91] r8169: add support for NCube 8168 network card This card identifies itself as: Ethernet controller [0200]: NCube Device [10ff:8168] (rev 06) Subsystem: TP-LINK Technologies Co., Ltd. Device [7470:3468] Adding a new entry to rtl8169_pci_tbl makes the card work. Link: http://launchpad.net/bugs/1788730 Signed-off-by: Anthony Wong Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169.c | 1 + include/linux/pci_ids.h | 2 ++ 2 files changed, 3 insertions(+) diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c index ac306797590e..b08d51bf7a20 100644 --- a/drivers/net/ethernet/realtek/r8169.c +++ b/drivers/net/ethernet/realtek/r8169.c @@ -218,6 +218,7 @@ static const struct pci_device_id rtl8169_pci_tbl[] = { { PCI_DEVICE(PCI_VENDOR_ID_REALTEK, 0x8161), 0, 0, RTL_CFG_1 }, { PCI_DEVICE(PCI_VENDOR_ID_REALTEK, 0x8167), 0, 0, RTL_CFG_0 }, { PCI_DEVICE(PCI_VENDOR_ID_REALTEK, 0x8168), 0, 0, RTL_CFG_1 }, + { PCI_DEVICE(PCI_VENDOR_ID_NCUBE, 0x8168), 0, 0, RTL_CFG_1 }, { PCI_DEVICE(PCI_VENDOR_ID_REALTEK, 0x8169), 0, 0, RTL_CFG_0 }, { PCI_VENDOR_ID_DLINK, 0x4300, PCI_VENDOR_ID_DLINK, 0x4b10, 0, 0, RTL_CFG_1 }, diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 99d366cb0e9f..d157983b84cf 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -3084,4 +3084,6 @@ #define PCI_VENDOR_ID_OCZ 0x1b85 +#define PCI_VENDOR_ID_NCUBE 0x10ff + #endif /* _LINUX_PCI_IDS_H */ From c48300c92ad9f029f4dcbcf5d71ad880e3acf2fa Mon Sep 17 00:00:00 2001 From: Gleb Fotengauer-Malinovskiy Date: Mon, 3 Sep 2018 20:59:13 +0300 Subject: [PATCH 78/91] vhost: fix VHOST_GET_BACKEND_FEATURES ioctl request definition The _IOC_READ flag fits this ioctl request more because this request actually only writes to, but doesn't read from userspace. See NOTEs in include/uapi/asm-generic/ioctl.h for more information. Fixes: 429711aec282 ("vhost: switch to use new message format") Signed-off-by: Gleb Fotengauer-Malinovskiy Acked-by: Jason Wang Acked-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- include/uapi/linux/vhost.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index b1e22c40c4b6..84c3de89696a 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -176,7 +176,7 @@ struct vhost_memory { #define VHOST_BACKEND_F_IOTLB_MSG_V2 0x1 #define VHOST_SET_BACKEND_FEATURES _IOW(VHOST_VIRTIO, 0x25, __u64) -#define VHOST_GET_BACKEND_FEATURES _IOW(VHOST_VIRTIO, 0x26, __u64) +#define VHOST_GET_BACKEND_FEATURES _IOR(VHOST_VIRTIO, 0x26, __u64) /* VHOST_NET specific defines */ From c10bbfae3ae43fae1d77e16f05a73474acf514ff Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Mon, 3 Sep 2018 10:04:55 +0300 Subject: [PATCH 79/91] net: sched: null actions array pointer before releasing action Currently, tcf_action_delete() nulls actions array pointer after putting and deleting it. However, if tcf_idr_delete_index() returns an error, pointer to action is not set to null. That results it being released second time in error handling code of tca_action_gd(). Kasan error: [ 807.367755] ================================================================== [ 807.375844] BUG: KASAN: use-after-free in tc_setup_cb_call+0x14e/0x250 [ 807.382763] Read of size 8 at addr ffff88033e636000 by task tc/2732 [ 807.391289] CPU: 0 PID: 2732 Comm: tc Tainted: G W 4.19.0-rc1+ #799 [ 807.399542] Hardware name: Supermicro SYS-2028TP-DECR/X10DRT-P, BIOS 2.0b 03/30/2017 [ 807.407948] Call Trace: [ 807.410763] dump_stack+0x92/0xeb [ 807.414456] print_address_description+0x70/0x360 [ 807.419549] kasan_report+0x14d/0x300 [ 807.423582] ? tc_setup_cb_call+0x14e/0x250 [ 807.428150] tc_setup_cb_call+0x14e/0x250 [ 807.432539] ? nla_put+0x65/0xe0 [ 807.436146] fl_dump+0x394/0x3f0 [cls_flower] [ 807.440890] ? fl_tmplt_dump+0x140/0x140 [cls_flower] [ 807.446327] ? lock_downgrade+0x320/0x320 [ 807.450702] ? lock_acquire+0xe2/0x220 [ 807.454819] ? is_bpf_text_address+0x5/0x140 [ 807.459475] ? memcpy+0x34/0x50 [ 807.462980] ? nla_put+0x65/0xe0 [ 807.466582] tcf_fill_node+0x341/0x430 [ 807.470717] ? tcf_block_put+0xe0/0xe0 [ 807.474859] tcf_node_dump+0xdb/0xf0 [ 807.478821] fl_walk+0x8e/0x170 [cls_flower] [ 807.483474] tcf_chain_dump+0x35a/0x4d0 [ 807.487703] ? tfilter_notify+0x170/0x170 [ 807.492091] ? tcf_fill_node+0x430/0x430 [ 807.496411] tc_dump_tfilter+0x362/0x3f0 [ 807.500712] ? tc_del_tfilter+0x850/0x850 [ 807.505104] ? kasan_unpoison_shadow+0x30/0x40 [ 807.509940] ? __mutex_unlock_slowpath+0xcf/0x410 [ 807.515031] netlink_dump+0x263/0x4f0 [ 807.519077] __netlink_dump_start+0x2a0/0x300 [ 807.523817] ? tc_del_tfilter+0x850/0x850 [ 807.528198] rtnetlink_rcv_msg+0x46a/0x6d0 [ 807.532671] ? rtnl_fdb_del+0x3f0/0x3f0 [ 807.536878] ? tc_del_tfilter+0x850/0x850 [ 807.541280] netlink_rcv_skb+0x18d/0x200 [ 807.545570] ? rtnl_fdb_del+0x3f0/0x3f0 [ 807.549773] ? netlink_ack+0x500/0x500 [ 807.553913] netlink_unicast+0x2d0/0x370 [ 807.558212] ? netlink_attachskb+0x340/0x340 [ 807.562855] ? _copy_from_iter_full+0xe9/0x3e0 [ 807.567677] ? import_iovec+0x11e/0x1c0 [ 807.571890] netlink_sendmsg+0x3b9/0x6a0 [ 807.576192] ? netlink_unicast+0x370/0x370 [ 807.580684] ? netlink_unicast+0x370/0x370 [ 807.585154] sock_sendmsg+0x6b/0x80 [ 807.589015] ___sys_sendmsg+0x4a1/0x520 [ 807.593230] ? copy_msghdr_from_user+0x210/0x210 [ 807.598232] ? do_wp_page+0x174/0x880 [ 807.602276] ? __handle_mm_fault+0x749/0x1c10 [ 807.607021] ? __handle_mm_fault+0x1046/0x1c10 [ 807.611849] ? __pmd_alloc+0x320/0x320 [ 807.615973] ? check_chain_key+0x140/0x1f0 [ 807.620450] ? check_chain_key+0x140/0x1f0 [ 807.624929] ? __fget_light+0xbc/0xd0 [ 807.628970] ? __sys_sendmsg+0xd7/0x150 [ 807.633172] __sys_sendmsg+0xd7/0x150 [ 807.637201] ? __ia32_sys_shutdown+0x30/0x30 [ 807.641846] ? up_read+0x53/0x90 [ 807.645442] ? __do_page_fault+0x484/0x780 [ 807.649949] ? do_syscall_64+0x1e/0x2c0 [ 807.654164] do_syscall_64+0x72/0x2c0 [ 807.658198] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 807.663625] RIP: 0033:0x7f42e9870150 [ 807.667568] Code: 8b 15 3c 7d 2b 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb cd 66 0f 1f 44 00 00 83 3d b9 d5 2b 00 00 75 10 b8 2e 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 be cd 00 00 48 89 04 24 [ 807.687328] RSP: 002b:00007ffdbf595b58 EFLAGS: 00000246 ORIG_RAX: 000000000000002e [ 807.695564] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f42e9870150 [ 807.703083] RDX: 0000000000000000 RSI: 00007ffdbf595b80 RDI: 0000000000000003 [ 807.710605] RBP: 00007ffdbf599d90 R08: 0000000000679bc0 R09: 000000000000000f [ 807.718127] R10: 00000000000005e7 R11: 0000000000000246 R12: 00007ffdbf599d88 [ 807.725651] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 [ 807.735048] Allocated by task 2687: [ 807.738902] kasan_kmalloc+0xa0/0xd0 [ 807.742852] __kmalloc+0x118/0x2d0 [ 807.746615] tcf_idr_create+0x44/0x320 [ 807.750738] tcf_nat_init+0x41e/0x530 [act_nat] [ 807.755638] tcf_action_init_1+0x4e0/0x650 [ 807.760104] tcf_action_init+0x1ce/0x2d0 [ 807.764395] tcf_exts_validate+0x1d8/0x200 [ 807.768861] fl_change+0x55a/0x26b4 [cls_flower] [ 807.773845] tc_new_tfilter+0x748/0xa20 [ 807.778051] rtnetlink_rcv_msg+0x56a/0x6d0 [ 807.782517] netlink_rcv_skb+0x18d/0x200 [ 807.786804] netlink_unicast+0x2d0/0x370 [ 807.791095] netlink_sendmsg+0x3b9/0x6a0 [ 807.795387] sock_sendmsg+0x6b/0x80 [ 807.799240] ___sys_sendmsg+0x4a1/0x520 [ 807.803445] __sys_sendmsg+0xd7/0x150 [ 807.807473] do_syscall_64+0x72/0x2c0 [ 807.811506] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 807.818776] Freed by task 2728: [ 807.822283] __kasan_slab_free+0x122/0x180 [ 807.826752] kfree+0xf4/0x2f0 [ 807.830080] __tcf_action_put+0x5a/0xb0 [ 807.834281] tcf_action_put_many+0x46/0x70 [ 807.838747] tca_action_gd+0x232/0xc40 [ 807.842862] tc_ctl_action+0x215/0x230 [ 807.846977] rtnetlink_rcv_msg+0x56a/0x6d0 [ 807.851444] netlink_rcv_skb+0x18d/0x200 [ 807.855731] netlink_unicast+0x2d0/0x370 [ 807.860021] netlink_sendmsg+0x3b9/0x6a0 [ 807.864312] sock_sendmsg+0x6b/0x80 [ 807.868166] ___sys_sendmsg+0x4a1/0x520 [ 807.872372] __sys_sendmsg+0xd7/0x150 [ 807.876401] do_syscall_64+0x72/0x2c0 [ 807.880431] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 807.887704] The buggy address belongs to the object at ffff88033e636000 which belongs to the cache kmalloc-256 of size 256 [ 807.900909] The buggy address is located 0 bytes inside of 256-byte region [ffff88033e636000, ffff88033e636100) [ 807.913155] The buggy address belongs to the page: [ 807.918322] page:ffffea000cf98d80 count:1 mapcount:0 mapping:ffff88036f80ee00 index:0x0 compound_mapcount: 0 [ 807.928831] flags: 0x5fff8000008100(slab|head) [ 807.933647] raw: 005fff8000008100 ffffea000db44f00 0000000400000004 ffff88036f80ee00 [ 807.942050] raw: 0000000000000000 0000000080190019 00000001ffffffff 0000000000000000 [ 807.950456] page dumped because: kasan: bad access detected [ 807.958240] Memory state around the buggy address: [ 807.963405] ffff88033e635f00: fc fc fc fc fb fb fb fb fb fb fb fc fc fc fc fb [ 807.971288] ffff88033e635f80: fb fb fb fb fb fb fc fc fc fc fc fc fc fc fc fc [ 807.979166] >ffff88033e636000: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 807.994882] ^ [ 807.998477] ffff88033e636080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 808.006352] ffff88033e636100: fc fc fc fc fc fc fc fc fb fb fb fb fb fb fb fb [ 808.014230] ================================================================== [ 808.022108] Disabling lock debugging due to kernel taint Fixes: edfaf94fa705 ("net_sched: improve and refactor tcf_action_put_many()") Signed-off-by: Vlad Buslov Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/act_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 316c98bb87e4..e12f8ef7baa4 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -1179,6 +1179,7 @@ static int tcf_action_delete(struct net *net, struct tc_action *actions[]) struct tcf_idrinfo *idrinfo = a->idrinfo; u32 act_index = a->tcfa_index; + actions[i] = NULL; if (tcf_action_put(a)) { /* last reference, action was deleted concurrently */ module_put(ops->owner); @@ -1190,7 +1191,6 @@ static int tcf_action_delete(struct net *net, struct tc_action *actions[]) if (ret < 0) return ret; } - actions[i] = NULL; } return 0; } From bf68066fccb10fce6bbffdda24ee2ae314c9c5b2 Mon Sep 17 00:00:00 2001 From: Ivan Mikhaylov Date: Mon, 3 Sep 2018 10:26:28 +0300 Subject: [PATCH 80/91] net/ibm/emac: wrong emac_calc_base call was used by typo __emac_calc_base_mr1 was used instead of __emac4_calc_base_mr1 by copy-paste mistake for emac4syn. Fixes: 45d6e545505fd32edb812f085be7de45b6a5c0af ("net/ibm/emac: add 8192 rx/tx fifo size") Signed-off-by: Ivan Mikhaylov Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/emac/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/ibm/emac/core.c b/drivers/net/ethernet/ibm/emac/core.c index 354c0982847b..372664686309 100644 --- a/drivers/net/ethernet/ibm/emac/core.c +++ b/drivers/net/ethernet/ibm/emac/core.c @@ -494,9 +494,6 @@ static u32 __emac_calc_base_mr1(struct emac_instance *dev, int tx_size, int rx_s case 16384: ret |= EMAC_MR1_RFS_16K; break; - case 8192: - ret |= EMAC4_MR1_RFS_8K; - break; case 4096: ret |= EMAC_MR1_RFS_4K; break; @@ -537,6 +534,9 @@ static u32 __emac4_calc_base_mr1(struct emac_instance *dev, int tx_size, int rx_ case 16384: ret |= EMAC4_MR1_RFS_16K; break; + case 8192: + ret |= EMAC4_MR1_RFS_8K; + break; case 4096: ret |= EMAC4_MR1_RFS_4K; break; From af8a2b8ba7678b4695f9e854ba9abae1076beabe Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 3 Sep 2018 15:47:10 +0800 Subject: [PATCH 81/91] sctp: fix invalid reference to the index variable of the iterator Now in sctp_apply_peer_addr_params(), if SPP_IPV6_FLOWLABEL flag is set and trans is NULL, it would use trans as the index variable to traverse transport_addr_list, then trans is set as the last transport of it. Later, if SPP_DSCP flag is set, it would enter into the wrong branch as trans is actually an invalid reference. So fix it by using a new index variable to traverse transport_addr_list for both SPP_DSCP and SPP_IPV6_FLOWLABEL flags process. Fixes: 0b0dce7a36fb ("sctp: add spp_ipv6_flowlabel and spp_dscp for sctp_paddrparams") Reported-by: Julia Lawall Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/socket.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index aa76586a1a1c..a0ccfa4b8220 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -2663,14 +2663,15 @@ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params, SCTP_FLOWLABEL_VAL_MASK; trans->flowlabel |= SCTP_FLOWLABEL_SET_MASK; } else if (asoc) { - list_for_each_entry(trans, - &asoc->peer.transport_addr_list, + struct sctp_transport *t; + + list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) { - if (trans->ipaddr.sa.sa_family != AF_INET6) + if (t->ipaddr.sa.sa_family != AF_INET6) continue; - trans->flowlabel = params->spp_ipv6_flowlabel & - SCTP_FLOWLABEL_VAL_MASK; - trans->flowlabel |= SCTP_FLOWLABEL_SET_MASK; + t->flowlabel = params->spp_ipv6_flowlabel & + SCTP_FLOWLABEL_VAL_MASK; + t->flowlabel |= SCTP_FLOWLABEL_SET_MASK; } asoc->flowlabel = params->spp_ipv6_flowlabel & SCTP_FLOWLABEL_VAL_MASK; @@ -2687,12 +2688,13 @@ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params, trans->dscp = params->spp_dscp & SCTP_DSCP_VAL_MASK; trans->dscp |= SCTP_DSCP_SET_MASK; } else if (asoc) { - list_for_each_entry(trans, - &asoc->peer.transport_addr_list, + struct sctp_transport *t; + + list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) { - trans->dscp = params->spp_dscp & - SCTP_DSCP_VAL_MASK; - trans->dscp |= SCTP_DSCP_SET_MASK; + t->dscp = params->spp_dscp & + SCTP_DSCP_VAL_MASK; + t->dscp |= SCTP_DSCP_SET_MASK; } asoc->dscp = params->spp_dscp & SCTP_DSCP_VAL_MASK; asoc->dscp |= SCTP_DSCP_SET_MASK; From 741880e1f2f59b20125dc480765d2546cec66080 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 3 Sep 2018 15:47:11 +0800 Subject: [PATCH 82/91] sctp: not traverse asoc trans list if non-ipv6 trans exists for ipv6_flowlabel When users set params.spp_address and get a trans, ipv6_flowlabel flag should be applied into this trans. But even if this one is not an ipv6 trans, it should not go to apply it into all other transes of the asoc but simply ignore it. Fixes: 0b0dce7a36fb ("sctp: add spp_ipv6_flowlabel and spp_dscp for sctp_paddrparams") Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/socket.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index a0ccfa4b8220..f73e9d38d5ba 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -2658,10 +2658,12 @@ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params, } if (params->spp_flags & SPP_IPV6_FLOWLABEL) { - if (trans && trans->ipaddr.sa.sa_family == AF_INET6) { - trans->flowlabel = params->spp_ipv6_flowlabel & - SCTP_FLOWLABEL_VAL_MASK; - trans->flowlabel |= SCTP_FLOWLABEL_SET_MASK; + if (trans) { + if (trans->ipaddr.sa.sa_family == AF_INET6) { + trans->flowlabel = params->spp_ipv6_flowlabel & + SCTP_FLOWLABEL_VAL_MASK; + trans->flowlabel |= SCTP_FLOWLABEL_SET_MASK; + } } else if (asoc) { struct sctp_transport *t; From 6b95c3e9697254dab0c8eafc6ab9d5e10d2eca4e Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Mon, 3 Sep 2018 04:23:17 -0400 Subject: [PATCH 83/91] bnxt_en: Fix firmware signaled resource change logic in open. When the driver detects that resources have changed during open, it should reset the rx and tx rings to 0. This will properly setup the init sequence to initialize the default rings again. We also need to signal the RDMA driver to stop and clear its interrupts. We then call the RoCE driver to restart if a new set of default rings is successfully reserved. Fixes: 25e1acd6b92b ("bnxt_en: Notify firmware about IF state changes.") Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 8bb1e38b1681..6a1baf375ac3 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -6684,6 +6684,8 @@ static int bnxt_hwrm_if_change(struct bnxt *bp, bool up) hw_resc->resv_rx_rings = 0; hw_resc->resv_hw_ring_grps = 0; hw_resc->resv_vnics = 0; + bp->tx_nr_rings = 0; + bp->rx_nr_rings = 0; } return rc; } @@ -8769,20 +8771,25 @@ static int bnxt_init_dflt_ring_mode(struct bnxt *bp) if (bp->tx_nr_rings) return 0; + bnxt_ulp_irq_stop(bp); + bnxt_clear_int_mode(bp); rc = bnxt_set_dflt_rings(bp, true); if (rc) { netdev_err(bp->dev, "Not enough rings available.\n"); - return rc; + goto init_dflt_ring_err; } rc = bnxt_init_int_mode(bp); if (rc) - return rc; + goto init_dflt_ring_err; + bp->tx_nr_rings_per_tc = bp->tx_nr_rings; if (bnxt_rfs_supported(bp) && bnxt_rfs_capable(bp)) { bp->flags |= BNXT_FLAG_RFS; bp->dev->features |= NETIF_F_NTUPLE; } - return 0; +init_dflt_ring_err: + bnxt_ulp_irq_restart(bp, rc); + return rc; } int bnxt_restore_pf_fw_resources(struct bnxt *bp) From ad95c27bdb930105f3eea02621bda157caf2862d Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Mon, 3 Sep 2018 04:23:18 -0400 Subject: [PATCH 84/91] bnxt_en: Clean up unused functions. Remove unused bnxt_subtract_ulp_resources(). Change bnxt_get_max_func_irqs() to static since it is only locally used. Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 +- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 1 - drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c | 15 --------------- drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h | 1 - 4 files changed, 1 insertion(+), 18 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 6a1baf375ac3..6472ce447f87 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -5918,7 +5918,7 @@ void bnxt_set_max_func_cp_rings(struct bnxt *bp, unsigned int max) bp->hw_resc.max_cp_rings = max; } -unsigned int bnxt_get_max_func_irqs(struct bnxt *bp) +static unsigned int bnxt_get_max_func_irqs(struct bnxt *bp) { struct bnxt_hw_resc *hw_resc = &bp->hw_resc; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index fefa011320e0..c4c77b9fa77b 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -1482,7 +1482,6 @@ unsigned int bnxt_get_max_func_stat_ctxs(struct bnxt *bp); void bnxt_set_max_func_stat_ctxs(struct bnxt *bp, unsigned int max); unsigned int bnxt_get_max_func_cp_rings(struct bnxt *bp); void bnxt_set_max_func_cp_rings(struct bnxt *bp, unsigned int max); -unsigned int bnxt_get_max_func_irqs(struct bnxt *bp); int bnxt_get_avail_msix(struct bnxt *bp, int num); int bnxt_reserve_rings(struct bnxt *bp); void bnxt_tx_disable(struct bnxt *bp); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c index c37b2842f972..deac73e8e0f7 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c @@ -220,21 +220,6 @@ int bnxt_get_ulp_msix_base(struct bnxt *bp) return 0; } -void bnxt_subtract_ulp_resources(struct bnxt *bp, int ulp_id) -{ - ASSERT_RTNL(); - if (bnxt_ulp_registered(bp->edev, ulp_id)) { - struct bnxt_en_dev *edev = bp->edev; - unsigned int msix_req, max; - - msix_req = edev->ulp_tbl[ulp_id].msix_requested; - max = bnxt_get_max_func_cp_rings(bp); - bnxt_set_max_func_cp_rings(bp, max - msix_req); - max = bnxt_get_max_func_stat_ctxs(bp); - bnxt_set_max_func_stat_ctxs(bp, max - 1); - } -} - static int bnxt_send_msg(struct bnxt_en_dev *edev, int ulp_id, struct bnxt_fw_msg *fw_msg) { diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h index df48ac71729f..d9bea37cd211 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h @@ -90,7 +90,6 @@ static inline bool bnxt_ulp_registered(struct bnxt_en_dev *edev, int ulp_id) int bnxt_get_ulp_msix_num(struct bnxt *bp); int bnxt_get_ulp_msix_base(struct bnxt *bp); -void bnxt_subtract_ulp_resources(struct bnxt *bp, int ulp_id); void bnxt_ulp_stop(struct bnxt *bp); void bnxt_ulp_start(struct bnxt *bp); void bnxt_ulp_sriov_cfg(struct bnxt *bp, int num_vfs); From 00fe9c326d2027f2437dea38ef0e82f9d02d94c0 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Mon, 3 Sep 2018 04:23:19 -0400 Subject: [PATCH 85/91] bnxt_en: Do not adjust max_cp_rings by the ones used by RDMA. Currently, the driver adjusts the bp->hw_resc.max_cp_rings by the number of MSIX vectors used by RDMA. There is one code path in open that needs to check the true max_cp_rings including any used by RDMA. This code is now checking for the reduced max_cp_rings which will fail when the number of cp rings is very small. To fix this in a clean way, we don't adjust max_cp_rings anymore. Instead, we add a helper bnxt_get_max_func_cp_rings_for_en() to get the reduced max_cp_rings when appropriate. Fixes: ec86f14ea506 ("bnxt_en: Add ULP calls to stop and restart IRQs.") Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 7 ++++--- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 2 +- drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c | 7 ++++--- drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c | 5 ----- 4 files changed, 9 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 6472ce447f87..cecbb1d1f587 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -5913,9 +5913,9 @@ unsigned int bnxt_get_max_func_cp_rings(struct bnxt *bp) return bp->hw_resc.max_cp_rings; } -void bnxt_set_max_func_cp_rings(struct bnxt *bp, unsigned int max) +unsigned int bnxt_get_max_func_cp_rings_for_en(struct bnxt *bp) { - bp->hw_resc.max_cp_rings = max; + return bp->hw_resc.max_cp_rings - bnxt_get_ulp_msix_num(bp); } static unsigned int bnxt_get_max_func_irqs(struct bnxt *bp) @@ -8631,7 +8631,8 @@ static void _bnxt_get_max_rings(struct bnxt *bp, int *max_rx, int *max_tx, *max_tx = hw_resc->max_tx_rings; *max_rx = hw_resc->max_rx_rings; - *max_cp = min_t(int, hw_resc->max_irqs, hw_resc->max_cp_rings); + *max_cp = min_t(int, bnxt_get_max_func_cp_rings_for_en(bp), + hw_resc->max_irqs); *max_cp = min_t(int, *max_cp, hw_resc->max_stat_ctxs); max_ring_grps = hw_resc->max_hw_ring_grps; if (BNXT_CHIP_TYPE_NITRO_A0(bp) && BNXT_PF(bp)) { diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index c4c77b9fa77b..bde384630a75 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -1481,7 +1481,7 @@ int bnxt_hwrm_set_coal(struct bnxt *); unsigned int bnxt_get_max_func_stat_ctxs(struct bnxt *bp); void bnxt_set_max_func_stat_ctxs(struct bnxt *bp, unsigned int max); unsigned int bnxt_get_max_func_cp_rings(struct bnxt *bp); -void bnxt_set_max_func_cp_rings(struct bnxt *bp, unsigned int max); +unsigned int bnxt_get_max_func_cp_rings_for_en(struct bnxt *bp); int bnxt_get_avail_msix(struct bnxt *bp, int num); int bnxt_reserve_rings(struct bnxt *bp); void bnxt_tx_disable(struct bnxt *bp); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c index 6d583bcd2a81..fcd085a9853a 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c @@ -451,7 +451,7 @@ static int bnxt_hwrm_func_vf_resc_cfg(struct bnxt *bp, int num_vfs) bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_VF_RESOURCE_CFG, -1, -1); - vf_cp_rings = hw_resc->max_cp_rings - bp->cp_nr_rings; + vf_cp_rings = bnxt_get_max_func_cp_rings_for_en(bp) - bp->cp_nr_rings; vf_stat_ctx = hw_resc->max_stat_ctxs - bp->num_stat_ctxs; if (bp->flags & BNXT_FLAG_AGG_RINGS) vf_rx_rings = hw_resc->max_rx_rings - bp->rx_nr_rings * 2; @@ -549,7 +549,8 @@ static int bnxt_hwrm_func_cfg(struct bnxt *bp, int num_vfs) max_stat_ctxs = hw_resc->max_stat_ctxs; /* Remaining rings are distributed equally amongs VF's for now */ - vf_cp_rings = (hw_resc->max_cp_rings - bp->cp_nr_rings) / num_vfs; + vf_cp_rings = (bnxt_get_max_func_cp_rings_for_en(bp) - + bp->cp_nr_rings) / num_vfs; vf_stat_ctx = (max_stat_ctxs - bp->num_stat_ctxs) / num_vfs; if (bp->flags & BNXT_FLAG_AGG_RINGS) vf_rx_rings = (hw_resc->max_rx_rings - bp->rx_nr_rings * 2) / @@ -643,7 +644,7 @@ static int bnxt_sriov_enable(struct bnxt *bp, int *num_vfs) */ vfs_supported = *num_vfs; - avail_cp = hw_resc->max_cp_rings - bp->cp_nr_rings; + avail_cp = bnxt_get_max_func_cp_rings_for_en(bp) - bp->cp_nr_rings; avail_stat = hw_resc->max_stat_ctxs - bp->num_stat_ctxs; avail_cp = min_t(int, avail_cp, avail_stat); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c index deac73e8e0f7..beee61292d5e 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c @@ -169,7 +169,6 @@ static int bnxt_req_msix_vecs(struct bnxt_en_dev *edev, int ulp_id, edev->ulp_tbl[ulp_id].msix_requested = avail_msix; } bnxt_fill_msix_vecs(bp, ent); - bnxt_set_max_func_cp_rings(bp, max_cp_rings - avail_msix); edev->flags |= BNXT_EN_FLAG_MSIX_REQUESTED; return avail_msix; } @@ -178,7 +177,6 @@ static int bnxt_free_msix_vecs(struct bnxt_en_dev *edev, int ulp_id) { struct net_device *dev = edev->net; struct bnxt *bp = netdev_priv(dev); - int max_cp_rings, msix_requested; ASSERT_RTNL(); if (ulp_id != BNXT_ROCE_ULP) @@ -187,9 +185,6 @@ static int bnxt_free_msix_vecs(struct bnxt_en_dev *edev, int ulp_id) if (!(edev->flags & BNXT_EN_FLAG_MSIX_REQUESTED)) return 0; - max_cp_rings = bnxt_get_max_func_cp_rings(bp); - msix_requested = edev->ulp_tbl[ulp_id].msix_requested; - bnxt_set_max_func_cp_rings(bp, max_cp_rings + msix_requested); edev->ulp_tbl[ulp_id].msix_requested = 0; edev->flags &= ~BNXT_EN_FLAG_MSIX_REQUESTED; if (netif_running(dev)) { From 9cc1bf3928b31e515ed15477b3c7eb653d0b3b42 Mon Sep 17 00:00:00 2001 From: Zhenbo Gao Date: Mon, 3 Sep 2018 16:36:45 +0800 Subject: [PATCH 86/91] tipc: correct spelling errors for struct tipc_bc_base's comment Trivial fix for two spelling mistakes. Signed-off-by: Zhenbo Gao Reviewed-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/bcast.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 9ee6cfea56dd..d8026543bf4c 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -51,12 +51,12 @@ const char tipc_bclink_name[] = "broadcast-link"; * struct tipc_bc_base - base structure for keeping broadcast send state * @link: broadcast send link structure * @inputq: data input queue; will only carry SOCK_WAKEUP messages - * @dest: array keeping number of reachable destinations per bearer + * @dests: array keeping number of reachable destinations per bearer * @primary_bearer: a bearer having links to all broadcast destinations, if any * @bcast_support: indicates if primary bearer, if any, supports broadcast * @rcast_support: indicates if all peer nodes support replicast * @rc_ratio: dest count as percentage of cluster size where send method changes - * @bc_threshold: calculated drom rc_ratio; if dests > threshold use broadcast + * @bc_threshold: calculated from rc_ratio; if dests > threshold use broadcast */ struct tipc_bc_base { struct tipc_link *link; From a484ef3442d2f05fa59edf4f6d14a8169d1b94a6 Mon Sep 17 00:00:00 2001 From: Zhenbo Gao Date: Mon, 3 Sep 2018 16:36:46 +0800 Subject: [PATCH 87/91] tipc: correct spelling errors for tipc_topsrv_queue_evt() comments tipc_conn_queue_evt -> tipc_topsrv_queue_evt tipc_send_work -> tipc_conn_send_work tipc_send_to_sock -> tipc_conn_send_to_sock Signed-off-by: Zhenbo Gao Reviewed-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/topsrv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c index c8e34ef22c30..2627b5d812e9 100644 --- a/net/tipc/topsrv.c +++ b/net/tipc/topsrv.c @@ -313,8 +313,8 @@ static void tipc_conn_send_work(struct work_struct *work) conn_put(con); } -/* tipc_conn_queue_evt() - interrupt level call from a subscription instance - * The queued work is launched into tipc_send_work()->tipc_send_to_sock() +/* tipc_topsrv_queue_evt() - interrupt level call from a subscription instance + * The queued work is launched into tipc_conn_send_work()->tipc_conn_send_to_sock() */ void tipc_topsrv_queue_evt(struct net *net, int conid, u32 event, struct tipc_event *evt) From 639505d4397b8c654a8e2616f9cb70ece40c83f9 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Mon, 3 Sep 2018 18:06:24 +0300 Subject: [PATCH 88/91] net/mlx5: Fix SQ offset in QPs with small RQ Correct the formula for calculating the RQ page remainder, which should be in byte granularity. The result will be non-zero only for RQs smaller than PAGE_SIZE, as an RQ size is a power of 2. Divide this by the SQ stride (MLX5_SEND_WQE_BB) to get the SQ offset in strides granularity. Fixes: d7037ad73daa ("net/mlx5: Fix QP fragmented buffer allocation") Signed-off-by: Tariq Toukan Reviewed-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/wq.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wq.c b/drivers/net/ethernet/mellanox/mlx5/core/wq.c index 86478a6b99c5..c8c315eb5128 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/wq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/wq.c @@ -139,14 +139,15 @@ int mlx5_wq_qp_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param, struct mlx5_wq_ctrl *wq_ctrl) { u32 sq_strides_offset; + u32 rq_pg_remainder; int err; mlx5_fill_fbc(MLX5_GET(qpc, qpc, log_rq_stride) + 4, MLX5_GET(qpc, qpc, log_rq_size), &wq->rq.fbc); - sq_strides_offset = - ((wq->rq.fbc.frag_sz_m1 + 1) % PAGE_SIZE) / MLX5_SEND_WQE_BB; + rq_pg_remainder = mlx5_wq_cyc_get_byte_size(&wq->rq) % PAGE_SIZE; + sq_strides_offset = rq_pg_remainder / MLX5_SEND_WQE_BB; mlx5_fill_fbc_offset(ilog2(MLX5_SEND_WQE_BB), MLX5_GET(qpc, qpc, log_sq_size), From 6d784f1625ea68783cc1fb17de8f6cd3e1660c3f Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 3 Sep 2018 11:08:15 -0700 Subject: [PATCH 89/91] act_ife: fix a potential use-after-free Immediately after module_put(), user could delete this module, so e->ops could be already freed before we call e->ops->release(). Fix this by moving module_put() after ops->release(). Fixes: ef6980b6becb ("introduce IFE action") Cc: Jamal Hadi Salim Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/act_ife.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 196430aefe87..fc412769a1be 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -400,7 +400,6 @@ static void _tcf_ife_cleanup(struct tc_action *a) struct tcf_meta_info *e, *n; list_for_each_entry_safe(e, n, &ife->metalist, metalist) { - module_put(e->ops->owner); list_del(&e->metalist); if (e->metaval) { if (e->ops->release) @@ -408,6 +407,7 @@ static void _tcf_ife_cleanup(struct tc_action *a) else kfree(e->metaval); } + module_put(e->ops->owner); kfree(e); } } From 84cb8eb26cb9ce3c79928094962a475a9d850a53 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Tue, 4 Sep 2018 00:44:42 +0300 Subject: [PATCH 90/91] net: sched: action_ife: take reference to meta module Recent refactoring of add_metainfo() caused use_all_metadata() to add metainfo to ife action metalist without taking reference to module. This causes warning in module_put called from ife action cleanup function. Implement add_metainfo_and_get_ops() function that returns with reference to module taken if metainfo was added successfully, and call it from use_all_metadata(), instead of calling __add_metainfo() directly. Example warning: [ 646.344393] WARNING: CPU: 1 PID: 2278 at kernel/module.c:1139 module_put+0x1cb/0x230 [ 646.352437] Modules linked in: act_meta_skbtcindex act_meta_mark act_meta_skbprio act_ife ife veth nfsv3 nfs fscache xt_CHECKSUM iptable_mangle ipt_MASQUERADE iptable_nat nf_nat_ipv4 nf_nat xt_conntrack nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 libcrc32c tun ebtable_filter ebtables ip6table_filter ip6_tables bridge stp llc mlx5_ib ib_uverbs ib_core intel_rapl sb_edac x86_pkg_temp_thermal mlx5_core coretemp kvm_intel kvm nfsd igb irqbypass crct10dif_pclmul devlink crc32_pclmul mei_me joydev ses crc32c_intel enclosure auth_rpcgss i2c_algo_bit ioatdma ptp mei pps_core ghash_clmulni_intel iTCO_wdt iTCO_vendor_support pcspkr dca ipmi_ssif lpc_ich target_core_mod i2c_i801 ipmi_si ipmi_devintf pcc_cpufreq wmi ipmi_msghandler nfs_acl lockd acpi_pad acpi_power_meter grace sunrpc mpt3sas raid_class scsi_transport_sas [ 646.425631] CPU: 1 PID: 2278 Comm: tc Not tainted 4.19.0-rc1+ #799 [ 646.432187] Hardware name: Supermicro SYS-2028TP-DECR/X10DRT-P, BIOS 2.0b 03/30/2017 [ 646.440595] RIP: 0010:module_put+0x1cb/0x230 [ 646.445238] Code: f3 66 94 02 e8 26 ff fa ff 85 c0 74 11 0f b6 1d 51 30 94 02 80 fb 01 77 60 83 e3 01 74 13 65 ff 0d 3a 83 db 73 e9 2b ff ff ff <0f> 0b e9 00 ff ff ff e8 59 01 fb ff 85 c0 75 e4 48 c7 c2 20 62 6b [ 646.464997] RSP: 0018:ffff880354d37068 EFLAGS: 00010286 [ 646.470599] RAX: 0000000000000000 RBX: ffffffffc0a52518 RCX: ffffffff8c2668db [ 646.478118] RDX: 0000000000000003 RSI: dffffc0000000000 RDI: ffffffffc0a52518 [ 646.485641] RBP: ffffffffc0a52180 R08: fffffbfff814a4a4 R09: fffffbfff814a4a3 [ 646.493164] R10: ffffffffc0a5251b R11: fffffbfff814a4a4 R12: 1ffff1006a9a6e0d [ 646.500687] R13: 00000000ffffffff R14: ffff880362bab890 R15: dead000000000100 [ 646.508213] FS: 00007f4164c99800(0000) GS:ffff88036fe40000(0000) knlGS:0000000000000000 [ 646.516961] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 646.523080] CR2: 00007f41638b8420 CR3: 0000000351df0004 CR4: 00000000001606e0 [ 646.530595] Call Trace: [ 646.533408] ? find_symbol_in_section+0x260/0x260 [ 646.538509] tcf_ife_cleanup+0x11b/0x200 [act_ife] [ 646.543695] tcf_action_cleanup+0x29/0xa0 [ 646.548078] __tcf_action_put+0x5a/0xb0 [ 646.552289] ? nla_put+0x65/0xe0 [ 646.555889] __tcf_idr_release+0x48/0x60 [ 646.560187] tcf_generic_walker+0x448/0x6b0 [ 646.564764] ? tcf_action_dump_1+0x450/0x450 [ 646.569411] ? __lock_is_held+0x84/0x110 [ 646.573720] ? tcf_ife_walker+0x10c/0x20f [act_ife] [ 646.578982] tca_action_gd+0x972/0xc40 [ 646.583129] ? tca_get_fill.constprop.17+0x250/0x250 [ 646.588471] ? mark_lock+0xcf/0x980 [ 646.592324] ? check_chain_key+0x140/0x1f0 [ 646.596832] ? debug_show_all_locks+0x240/0x240 [ 646.601839] ? memset+0x1f/0x40 [ 646.605350] ? nla_parse+0xca/0x1a0 [ 646.609217] tc_ctl_action+0x215/0x230 [ 646.613339] ? tcf_action_add+0x220/0x220 [ 646.617748] rtnetlink_rcv_msg+0x56a/0x6d0 [ 646.622227] ? rtnl_fdb_del+0x3f0/0x3f0 [ 646.626466] netlink_rcv_skb+0x18d/0x200 [ 646.630752] ? rtnl_fdb_del+0x3f0/0x3f0 [ 646.634959] ? netlink_ack+0x500/0x500 [ 646.639106] netlink_unicast+0x2d0/0x370 [ 646.643409] ? netlink_attachskb+0x340/0x340 [ 646.648050] ? _copy_from_iter_full+0xe9/0x3e0 [ 646.652870] ? import_iovec+0x11e/0x1c0 [ 646.657083] netlink_sendmsg+0x3b9/0x6a0 [ 646.661388] ? netlink_unicast+0x370/0x370 [ 646.665877] ? netlink_unicast+0x370/0x370 [ 646.670351] sock_sendmsg+0x6b/0x80 [ 646.674212] ___sys_sendmsg+0x4a1/0x520 [ 646.678443] ? copy_msghdr_from_user+0x210/0x210 [ 646.683463] ? lock_downgrade+0x320/0x320 [ 646.687849] ? debug_show_all_locks+0x240/0x240 [ 646.692760] ? do_raw_spin_unlock+0xa2/0x130 [ 646.697418] ? _raw_spin_unlock+0x24/0x30 [ 646.701798] ? __handle_mm_fault+0x1819/0x1c10 [ 646.706619] ? __pmd_alloc+0x320/0x320 [ 646.710738] ? debug_show_all_locks+0x240/0x240 [ 646.715649] ? restore_nameidata+0x7b/0xa0 [ 646.720117] ? check_chain_key+0x140/0x1f0 [ 646.724590] ? check_chain_key+0x140/0x1f0 [ 646.729070] ? __fget_light+0xbc/0xd0 [ 646.733121] ? __sys_sendmsg+0xd7/0x150 [ 646.737329] __sys_sendmsg+0xd7/0x150 [ 646.741359] ? __ia32_sys_shutdown+0x30/0x30 [ 646.746003] ? up_read+0x53/0x90 [ 646.749601] ? __do_page_fault+0x484/0x780 [ 646.754105] ? do_syscall_64+0x1e/0x2c0 [ 646.758320] do_syscall_64+0x72/0x2c0 [ 646.762353] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 646.767776] RIP: 0033:0x7f4163872150 [ 646.771713] Code: 8b 15 3c 7d 2b 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb cd 66 0f 1f 44 00 00 83 3d b9 d5 2b 00 00 75 10 b8 2e 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 be cd 00 00 48 89 04 24 [ 646.791474] RSP: 002b:00007ffdef7d6b58 EFLAGS: 00000246 ORIG_RAX: 000000000000002e [ 646.799721] RAX: ffffffffffffffda RBX: 0000000000000024 RCX: 00007f4163872150 [ 646.807240] RDX: 0000000000000000 RSI: 00007ffdef7d6bd0 RDI: 0000000000000003 [ 646.814760] RBP: 000000005b8b9482 R08: 0000000000000001 R09: 0000000000000000 [ 646.822286] R10: 00000000000005e7 R11: 0000000000000246 R12: 00007ffdef7dad20 [ 646.829807] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000679bc0 [ 646.837360] irq event stamp: 6083 [ 646.841043] hardirqs last enabled at (6081): [] __call_rcu+0x17d/0x500 [ 646.849882] hardirqs last disabled at (6083): [] trace_hardirqs_off_thunk+0x1a/0x1c [ 646.859775] softirqs last enabled at (5968): [] __do_softirq+0x4a1/0x6ee [ 646.868784] softirqs last disabled at (6082): [] tcf_ife_cleanup+0x39/0x200 [act_ife] [ 646.878845] ---[ end trace b1b8c12ffe51e657 ]--- Fixes: 5ffe57da29b3 ("act_ife: fix a potential deadlock") Signed-off-by: Vlad Buslov Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/act_ife.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index fc412769a1be..06a3d4801878 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -326,6 +326,20 @@ static int __add_metainfo(const struct tcf_meta_ops *ops, return ret; } +static int add_metainfo_and_get_ops(const struct tcf_meta_ops *ops, + struct tcf_ife_info *ife, u32 metaid, + bool exists) +{ + int ret; + + if (!try_module_get(ops->owner)) + return -ENOENT; + ret = __add_metainfo(ops, ife, metaid, NULL, 0, true, exists); + if (ret) + module_put(ops->owner); + return ret; +} + static int add_metainfo(struct tcf_ife_info *ife, u32 metaid, void *metaval, int len, bool exists) { @@ -349,7 +363,7 @@ static int use_all_metadata(struct tcf_ife_info *ife, bool exists) read_lock(&ife_mod_lock); list_for_each_entry(o, &ifeoplist, list) { - rc = __add_metainfo(o, ife, o->metaid, NULL, 0, true, exists); + rc = add_metainfo_and_get_ops(o, ife, o->metaid, exists); if (rc == 0) installed += 1; } From a33710bdb6b284f8f1e24f1119d167037b374ebb Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Tue, 4 Sep 2018 04:23:56 +0200 Subject: [PATCH 91/91] net: phy: sfp: Handle unimplemented hwmon limits and alarms Not all SFPs implement the registers containing sensor limits and alarms. Luckily, there is a bit indicating if they are implemented or not. Add checking for this bit, when deciding if the hwmon attributes should be visible. Fixes: 1323061a018a ("net: phy: sfp: Add HWMON support for module sensors") Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/sfp.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/drivers/net/phy/sfp.c b/drivers/net/phy/sfp.c index 4637d980310e..52fffb98fde9 100644 --- a/drivers/net/phy/sfp.c +++ b/drivers/net/phy/sfp.c @@ -398,7 +398,6 @@ static umode_t sfp_hwmon_is_visible(const void *data, switch (type) { case hwmon_temp: switch (attr) { - case hwmon_temp_input: case hwmon_temp_min_alarm: case hwmon_temp_max_alarm: case hwmon_temp_lcrit_alarm: @@ -407,13 +406,16 @@ static umode_t sfp_hwmon_is_visible(const void *data, case hwmon_temp_max: case hwmon_temp_lcrit: case hwmon_temp_crit: + if (!(sfp->id.ext.enhopts & SFP_ENHOPTS_ALARMWARN)) + return 0; + /* fall through */ + case hwmon_temp_input: return 0444; default: return 0; } case hwmon_in: switch (attr) { - case hwmon_in_input: case hwmon_in_min_alarm: case hwmon_in_max_alarm: case hwmon_in_lcrit_alarm: @@ -422,13 +424,16 @@ static umode_t sfp_hwmon_is_visible(const void *data, case hwmon_in_max: case hwmon_in_lcrit: case hwmon_in_crit: + if (!(sfp->id.ext.enhopts & SFP_ENHOPTS_ALARMWARN)) + return 0; + /* fall through */ + case hwmon_in_input: return 0444; default: return 0; } case hwmon_curr: switch (attr) { - case hwmon_curr_input: case hwmon_curr_min_alarm: case hwmon_curr_max_alarm: case hwmon_curr_lcrit_alarm: @@ -437,6 +442,10 @@ static umode_t sfp_hwmon_is_visible(const void *data, case hwmon_curr_max: case hwmon_curr_lcrit: case hwmon_curr_crit: + if (!(sfp->id.ext.enhopts & SFP_ENHOPTS_ALARMWARN)) + return 0; + /* fall through */ + case hwmon_curr_input: return 0444; default: return 0; @@ -452,7 +461,6 @@ static umode_t sfp_hwmon_is_visible(const void *data, channel == 1) return 0; switch (attr) { - case hwmon_power_input: case hwmon_power_min_alarm: case hwmon_power_max_alarm: case hwmon_power_lcrit_alarm: @@ -461,6 +469,10 @@ static umode_t sfp_hwmon_is_visible(const void *data, case hwmon_power_max: case hwmon_power_lcrit: case hwmon_power_crit: + if (!(sfp->id.ext.enhopts & SFP_ENHOPTS_ALARMWARN)) + return 0; + /* fall through */ + case hwmon_power_input: return 0444; default: return 0;