Merge branch 'hv_netvsc-Support-LRO-RSC-in-the-vSwitch'

Haiyang Zhang says:

====================
hv_netvsc: Support LRO/RSC in the vSwitch

This patch set adds support for the LRO/RSC in the vSwitch feature. It reduces
per-packet processing overhead by coalescing multiple TCP segments
when possible. The feature is enabled by default on VMs running on
Windows Server 2019 and later.

The patch set also adds an ethtool command handler and documentation.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
David S. Miller 2018-09-22 17:23:16 -07:00
commit 739d0def85
5 changed files with 194 additions and 39 deletions

View File

@ -45,6 +45,15 @@ Features
like packets and significantly reduces CPU usage under heavy Rx
load.
Large Receive Offload (LRO), or Receive Side Coalescing (RSC)
-------------------------------------------------------------
The driver supports the LRO/RSC in the vSwitch feature. It reduces per-packet
processing overhead by coalescing multiple TCP segments when possible. The
feature is enabled by default on VMs running on Windows Server 2019 and
later. It may be changed with the ethtool command:
ethtool -K eth0 lro on
ethtool -K eth0 lro off
SR-IOV support
--------------
Hyper-V supports SR-IOV as a hardware acceleration option. If SR-IOV
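
The ethtool commands above are handled by the driver's new .ndo_set_features callback, netvsc_set_features(), added in a later hunk of this commit; it only reprograms the host when the LRO bit actually changes. Below is a minimal userspace sketch of that change-detection pattern, not the driver code itself; the FAKE_NETIF_F_* values are made-up stand-ins for the kernel's netdev_features_t bits.

#include <stdio.h>
#include <stdint.h>

/* Made-up stand-ins for netdev feature bits (illustration only) */
#define FAKE_NETIF_F_LRO	(1u << 0)
#define FAKE_NETIF_F_RXCSUM	(1u << 1)

/* Model of "only act when the interesting bit flipped" */
static int set_features(uint32_t *current, uint32_t requested)
{
	uint32_t change = requested ^ *current;

	if (!(change & FAKE_NETIF_F_LRO))
		return 0;		/* nothing to reprogram on the host */

	if (requested & FAKE_NETIF_F_LRO)
		printf("enable RSC offload on the host\n");
	else
		printf("disable RSC offload on the host\n");

	*current = requested;
	return 0;
}

int main(void)
{
	uint32_t features = FAKE_NETIF_F_LRO | FAKE_NETIF_F_RXCSUM;

	set_features(&features, features & ~FAKE_NETIF_F_LRO);	/* "ethtool -K eth0 lro off" */
	set_features(&features, features);			/* no change, no host message */
	return 0;
}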

View File

@ -185,7 +185,9 @@ struct rndis_device {
/* Interface */
struct rndis_message;
struct ndis_offload_params;
struct netvsc_device;
struct netvsc_channel;
struct net_device_context;
extern u32 netvsc_ring_bytes;
@ -203,10 +205,7 @@ void netvsc_linkstatus_callback(struct net_device *net,
struct rndis_message *resp);
int netvsc_recv_callback(struct net_device *net,
struct netvsc_device *nvdev,
struct vmbus_channel *channel,
void *data, u32 len,
const struct ndis_tcp_ip_checksum_info *csum_info,
const struct ndis_pkt_8021q_info *vlan);
struct netvsc_channel *nvchan);
void netvsc_channel_cb(void *context);
int netvsc_poll(struct napi_struct *napi, int budget);
@ -220,9 +219,12 @@ void rndis_filter_device_remove(struct hv_device *dev,
struct netvsc_device *nvdev);
int rndis_filter_set_rss_param(struct rndis_device *rdev,
const u8 *key);
int rndis_filter_set_offload_params(struct net_device *ndev,
struct netvsc_device *nvdev,
struct ndis_offload_params *req_offloads);
int rndis_filter_receive(struct net_device *ndev,
struct netvsc_device *net_dev,
struct vmbus_channel *channel,
struct netvsc_channel *nvchan,
void *data, u32 buflen);
int rndis_filter_set_device_mac(struct netvsc_device *ndev,
@ -524,6 +526,8 @@ struct nvsp_2_vsc_capability {
u64 ieee8021q:1;
u64 correlation_id:1;
u64 teaming:1;
u64 vsubnetid:1;
u64 rsc:1;
};
};
} __packed;
@ -826,7 +830,7 @@ struct nvsp_message {
#define NETVSC_SUPPORTED_HW_FEATURES (NETIF_F_RXCSUM | NETIF_F_IP_CSUM | \
NETIF_F_TSO | NETIF_F_IPV6_CSUM | \
NETIF_F_TSO6)
NETIF_F_TSO6 | NETIF_F_LRO)
#define VRSS_SEND_TAB_SIZE 16 /* must be power of 2 */
#define VRSS_CHANNEL_MAX 64
@ -852,6 +856,18 @@ struct multi_recv_comp {
u32 next; /* next entry for writing */
};
#define NVSP_RSC_MAX 562 /* Max #RSC frags in a vmbus xfer page pkt */
struct nvsc_rsc {
const struct ndis_pkt_8021q_info *vlan;
const struct ndis_tcp_ip_checksum_info *csum_info;
u8 is_last; /* last RNDIS msg in a vmtransfer_page */
u32 cnt; /* #fragments in an RSC packet */
u32 pktlen; /* Full packet length */
void *data[NVSP_RSC_MAX];
u32 len[NVSP_RSC_MAX];
};
struct netvsc_stats {
u64 packets;
u64 bytes;
@ -955,6 +971,7 @@ struct netvsc_channel {
struct multi_send_data msd;
struct multi_recv_comp mrc;
atomic_t queue_sends;
struct nvsc_rsc rsc;
struct netvsc_stats tx_stats;
struct netvsc_stats rx_stats;
@ -1136,7 +1153,8 @@ struct rndis_oobd {
/* Packet extension field contents associated with a Data message. */
struct rndis_per_packet_info {
u32 size;
u32 type;
u32 type:31;
u32 internal:1;
u32 ppi_offset;
};
@ -1157,6 +1175,25 @@ enum ndis_per_pkt_info_type {
MAX_PER_PKT_INFO
};
enum rndis_per_pkt_info_interal_type {
RNDIS_PKTINFO_ID = 1,
/* Add more members here */
RNDIS_PKTINFO_MAX
};
#define RNDIS_PKTINFO_SUBALLOC BIT(0)
#define RNDIS_PKTINFO_1ST_FRAG BIT(1)
#define RNDIS_PKTINFO_LAST_FRAG BIT(2)
#define RNDIS_PKTINFO_ID_V1 1
struct rndis_pktinfo_id {
u8 ver;
u8 flag;
u16 pkt_id;
};
struct ndis_pkt_8021q_info {
union {
struct {
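
struct nvsc_rsc above is per-channel scratch state for receive coalescing: rsc_add_data(), added in a later hunk of this commit, records each RNDIS fragment's pointer and length, and netvsc_alloc_recv_skb() then copies all recorded fragments into a single skb of rsc.pktlen bytes. The following is a minimal userspace model of that accumulate-then-flush flow, with a small MODEL_RSC_MAX in place of NVSP_RSC_MAX (562) and plain buffers in place of skbs; the struct and function names are illustrative only.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MODEL_RSC_MAX 8			/* stands in for NVSP_RSC_MAX */

struct model_rsc {
	unsigned int cnt;		/* #fragments accumulated so far */
	unsigned int pktlen;		/* total length of the coalesced packet */
	const void *data[MODEL_RSC_MAX];
	unsigned int len[MODEL_RSC_MAX];
};

/* Mirrors rsc_add_data(): record the fragment, grow pktlen */
static int rsc_add(struct model_rsc *rsc, const void *data, unsigned int len)
{
	if (rsc->cnt >= MODEL_RSC_MAX)
		return -1;			/* caller drops the packet */
	if (rsc->cnt)
		rsc->pktlen += len;
	else
		rsc->pktlen = len;		/* first fragment of a new packet */
	rsc->data[rsc->cnt] = data;
	rsc->len[rsc->cnt] = len;
	rsc->cnt++;
	return 0;
}

/* Mirrors the skb_put_data() loop: copy fragments into one buffer */
static void *rsc_flush(struct model_rsc *rsc, unsigned int *out_len)
{
	char *buf = malloc(rsc->pktlen);
	unsigned int i, off = 0;

	for (i = 0; buf && i < rsc->cnt; i++) {
		memcpy(buf + off, rsc->data[i], rsc->len[i]);
		off += rsc->len[i];
	}
	*out_len = rsc->pktlen;
	rsc->cnt = 0;				/* ready for the next packet */
	return buf;
}

int main(void)
{
	struct model_rsc rsc = { 0 };
	unsigned int len;
	char *pkt;

	rsc_add(&rsc, "seg1-", 5);
	rsc_add(&rsc, "seg2-", 5);
	rsc_add(&rsc, "seg3", 4);
	pkt = rsc_flush(&rsc, &len);
	if (pkt)
		printf("coalesced %u bytes: %.*s\n", len, (int)len, pkt);
	free(pkt);
	return 0;
}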

View File

@ -542,6 +542,9 @@ static int negotiate_nvsp_ver(struct hv_device *device,
init_packet->msg.v2_msg.send_ndis_config.capability.teaming = 1;
}
if (nvsp_ver >= NVSP_PROTOCOL_VERSION_61)
init_packet->msg.v2_msg.send_ndis_config.capability.rsc = 1;
trace_nvsp_send(ndev, init_packet);
ret = vmbus_sendpacket(device->channel, init_packet,
@ -1111,11 +1114,12 @@ static void enq_receive_complete(struct net_device *ndev,
static int netvsc_receive(struct net_device *ndev,
struct netvsc_device *net_device,
struct vmbus_channel *channel,
struct netvsc_channel *nvchan,
const struct vmpacket_descriptor *desc,
const struct nvsp_message *nvsp)
{
struct net_device_context *net_device_ctx = netdev_priv(ndev);
struct vmbus_channel *channel = nvchan->channel;
const struct vmtransfer_page_packet_header *vmxferpage_packet
= container_of(desc, const struct vmtransfer_page_packet_header, d);
u16 q_idx = channel->offermsg.offer.sub_channel_index;
@ -1150,6 +1154,7 @@ static int netvsc_receive(struct net_device *ndev,
int ret;
if (unlikely(offset + buflen > net_device->recv_buf_size)) {
nvchan->rsc.cnt = 0;
status = NVSP_STAT_FAIL;
netif_err(net_device_ctx, rx_err, ndev,
"Packet offset:%u + len:%u too big\n",
@ -1160,11 +1165,13 @@ static int netvsc_receive(struct net_device *ndev,
data = recv_buf + offset;
nvchan->rsc.is_last = (i == count - 1);
trace_rndis_recv(ndev, q_idx, data);
/* Pass it to the upper layer */
ret = rndis_filter_receive(ndev, net_device,
channel, data, buflen);
nvchan, data, buflen);
if (unlikely(ret != NVSP_STAT_SUCCESS))
status = NVSP_STAT_FAIL;
@ -1223,12 +1230,13 @@ static void netvsc_receive_inband(struct net_device *ndev,
}
static int netvsc_process_raw_pkt(struct hv_device *device,
struct vmbus_channel *channel,
struct netvsc_channel *nvchan,
struct netvsc_device *net_device,
struct net_device *ndev,
const struct vmpacket_descriptor *desc,
int budget)
{
struct vmbus_channel *channel = nvchan->channel;
const struct nvsp_message *nvmsg = hv_pkt_data(desc);
trace_nvsp_recv(ndev, channel, nvmsg);
@ -1240,7 +1248,7 @@ static int netvsc_process_raw_pkt(struct hv_device *device,
break;
case VM_PKT_DATA_USING_XFER_PAGES:
return netvsc_receive(ndev, net_device, channel,
return netvsc_receive(ndev, net_device, nvchan,
desc, nvmsg);
break;
@ -1284,7 +1292,7 @@ int netvsc_poll(struct napi_struct *napi, int budget)
nvchan->desc = hv_pkt_iter_first(channel);
while (nvchan->desc && work_done < budget) {
work_done += netvsc_process_raw_pkt(device, channel, net_device,
work_done += netvsc_process_raw_pkt(device, nvchan, net_device,
ndev, nvchan->desc, budget);
nvchan->desc = hv_pkt_iter_next(channel, nvchan->desc);
}

View File

@ -744,14 +744,16 @@ void netvsc_linkstatus_callback(struct net_device *net,
}
static struct sk_buff *netvsc_alloc_recv_skb(struct net_device *net,
struct napi_struct *napi,
const struct ndis_tcp_ip_checksum_info *csum_info,
const struct ndis_pkt_8021q_info *vlan,
void *data, u32 buflen)
struct netvsc_channel *nvchan)
{
struct napi_struct *napi = &nvchan->napi;
const struct ndis_pkt_8021q_info *vlan = nvchan->rsc.vlan;
const struct ndis_tcp_ip_checksum_info *csum_info =
nvchan->rsc.csum_info;
struct sk_buff *skb;
int i;
skb = napi_alloc_skb(napi, buflen);
skb = napi_alloc_skb(napi, nvchan->rsc.pktlen);
if (!skb)
return skb;
@ -759,7 +761,8 @@ static struct sk_buff *netvsc_alloc_recv_skb(struct net_device *net,
* Copy to skb. This copy is needed here since the memory pointed by
* hv_netvsc_packet cannot be deallocated
*/
skb_put_data(skb, data, buflen);
for (i = 0; i < nvchan->rsc.cnt; i++)
skb_put_data(skb, nvchan->rsc.data[i], nvchan->rsc.len[i]);
skb->protocol = eth_type_trans(skb, net);
@ -792,14 +795,11 @@ static struct sk_buff *netvsc_alloc_recv_skb(struct net_device *net,
*/
int netvsc_recv_callback(struct net_device *net,
struct netvsc_device *net_device,
struct vmbus_channel *channel,
void *data, u32 len,
const struct ndis_tcp_ip_checksum_info *csum_info,
const struct ndis_pkt_8021q_info *vlan)
struct netvsc_channel *nvchan)
{
struct net_device_context *net_device_ctx = netdev_priv(net);
struct vmbus_channel *channel = nvchan->channel;
u16 q_idx = channel->offermsg.offer.sub_channel_index;
struct netvsc_channel *nvchan = &net_device->chan_table[q_idx];
struct sk_buff *skb;
struct netvsc_stats *rx_stats;
@ -807,8 +807,8 @@ int netvsc_recv_callback(struct net_device *net,
return NVSP_STAT_FAIL;
/* Allocate a skb - TODO direct I/O to pages? */
skb = netvsc_alloc_recv_skb(net, &nvchan->napi,
csum_info, vlan, data, len);
skb = netvsc_alloc_recv_skb(net, nvchan);
if (unlikely(!skb)) {
++net_device_ctx->eth_stats.rx_no_memory;
rcu_read_unlock();
@ -825,7 +825,7 @@ int netvsc_recv_callback(struct net_device *net,
rx_stats = &nvchan->rx_stats;
u64_stats_update_begin(&rx_stats->syncp);
rx_stats->packets++;
rx_stats->bytes += len;
rx_stats->bytes += nvchan->rsc.pktlen;
if (skb->pkt_type == PACKET_BROADCAST)
++rx_stats->broadcast;
@ -1006,6 +1006,8 @@ static void netvsc_init_settings(struct net_device *dev)
ndc->speed = SPEED_UNKNOWN;
ndc->duplex = DUPLEX_FULL;
dev->features = NETIF_F_LRO;
}
static int netvsc_get_link_ksettings(struct net_device *dev,
@ -1733,6 +1735,33 @@ static int netvsc_set_ringparam(struct net_device *ndev,
return ret;
}
static int netvsc_set_features(struct net_device *ndev,
netdev_features_t features)
{
netdev_features_t change = features ^ ndev->features;
struct net_device_context *ndevctx = netdev_priv(ndev);
struct netvsc_device *nvdev = rtnl_dereference(ndevctx->nvdev);
struct ndis_offload_params offloads;
if (!nvdev || nvdev->destroy)
return -ENODEV;
if (!(change & NETIF_F_LRO))
return 0;
memset(&offloads, 0, sizeof(struct ndis_offload_params));
if (features & NETIF_F_LRO) {
offloads.rsc_ip_v4 = NDIS_OFFLOAD_PARAMETERS_RSC_ENABLED;
offloads.rsc_ip_v6 = NDIS_OFFLOAD_PARAMETERS_RSC_ENABLED;
} else {
offloads.rsc_ip_v4 = NDIS_OFFLOAD_PARAMETERS_RSC_DISABLED;
offloads.rsc_ip_v6 = NDIS_OFFLOAD_PARAMETERS_RSC_DISABLED;
}
return rndis_filter_set_offload_params(ndev, nvdev, &offloads);
}
static u32 netvsc_get_msglevel(struct net_device *ndev)
{
struct net_device_context *ndev_ctx = netdev_priv(ndev);
@ -1776,6 +1805,7 @@ static const struct net_device_ops device_ops = {
.ndo_start_xmit = netvsc_start_xmit,
.ndo_change_rx_flags = netvsc_change_rx_flags,
.ndo_set_rx_mode = netvsc_set_rx_mode,
.ndo_set_features = netvsc_set_features,
.ndo_change_mtu = netvsc_change_mtu,
.ndo_validate_addr = eth_validate_addr,
.ndo_set_mac_address = netvsc_set_mac_addr,

View File

@ -342,7 +342,8 @@ static void rndis_filter_receive_response(struct net_device *ndev,
* Get the Per-Packet-Info with the specified type
* return NULL if not found.
*/
static inline void *rndis_get_ppi(struct rndis_packet *rpkt, u32 type)
static inline void *rndis_get_ppi(struct rndis_packet *rpkt,
u32 type, u8 internal)
{
struct rndis_per_packet_info *ppi;
int len;
@ -355,7 +356,7 @@ static inline void *rndis_get_ppi(struct rndis_packet *rpkt, u32 type)
len = rpkt->per_pkt_info_len;
while (len > 0) {
if (ppi->type == type)
if (ppi->type == type && ppi->internal == internal)
return (void *)((ulong)ppi + ppi->ppi_offset);
len -= ppi->size;
ppi = (struct rndis_per_packet_info *)((ulong)ppi + ppi->size);
@ -364,17 +365,41 @@ static inline void *rndis_get_ppi(struct rndis_packet *rpkt, u32 type)
return NULL;
}
static inline
void rsc_add_data(struct netvsc_channel *nvchan,
const struct ndis_pkt_8021q_info *vlan,
const struct ndis_tcp_ip_checksum_info *csum_info,
void *data, u32 len)
{
u32 cnt = nvchan->rsc.cnt;
if (cnt) {
nvchan->rsc.pktlen += len;
} else {
nvchan->rsc.vlan = vlan;
nvchan->rsc.csum_info = csum_info;
nvchan->rsc.pktlen = len;
}
nvchan->rsc.data[cnt] = data;
nvchan->rsc.len[cnt] = len;
nvchan->rsc.cnt++;
}
static int rndis_filter_receive_data(struct net_device *ndev,
struct netvsc_device *nvdev,
struct vmbus_channel *channel,
struct netvsc_channel *nvchan,
struct rndis_message *msg,
u32 data_buflen)
{
struct rndis_packet *rndis_pkt = &msg->msg.pkt;
const struct ndis_tcp_ip_checksum_info *csum_info;
const struct ndis_pkt_8021q_info *vlan;
const struct rndis_pktinfo_id *pktinfo_id;
u32 data_offset;
void *data;
bool rsc_more = false;
int ret;
/* Remove the rndis header and pass it back up the stack */
data_offset = RNDIS_HEADER_SIZE + rndis_pkt->data_offset;
@ -393,25 +418,59 @@ static int rndis_filter_receive_data(struct net_device *ndev,
return NVSP_STAT_FAIL;
}
vlan = rndis_get_ppi(rndis_pkt, IEEE_8021Q_INFO);
vlan = rndis_get_ppi(rndis_pkt, IEEE_8021Q_INFO, 0);
csum_info = rndis_get_ppi(rndis_pkt, TCPIP_CHKSUM_PKTINFO);
csum_info = rndis_get_ppi(rndis_pkt, TCPIP_CHKSUM_PKTINFO, 0);
pktinfo_id = rndis_get_ppi(rndis_pkt, RNDIS_PKTINFO_ID, 1);
data = (void *)msg + data_offset;
/*
* Remove the rndis trailer padding from rndis packet message
/* Identify RSC frags, drop erroneous packets */
if (pktinfo_id && (pktinfo_id->flag & RNDIS_PKTINFO_SUBALLOC)) {
if (pktinfo_id->flag & RNDIS_PKTINFO_1ST_FRAG)
nvchan->rsc.cnt = 0;
else if (nvchan->rsc.cnt == 0)
goto drop;
rsc_more = true;
if (pktinfo_id->flag & RNDIS_PKTINFO_LAST_FRAG)
rsc_more = false;
if (rsc_more && nvchan->rsc.is_last)
goto drop;
} else {
nvchan->rsc.cnt = 0;
}
if (unlikely(nvchan->rsc.cnt >= NVSP_RSC_MAX))
goto drop;
/* Put data into per channel structure.
* Also, remove the rndis trailer padding from rndis packet message
* rndis_pkt->data_len tells us the real data length, we only copy
* the data packet to the stack, without the rndis trailer padding
*/
return netvsc_recv_callback(ndev, nvdev, channel,
data, rndis_pkt->data_len,
csum_info, vlan);
rsc_add_data(nvchan, vlan, csum_info, data, rndis_pkt->data_len);
if (rsc_more)
return NVSP_STAT_SUCCESS;
ret = netvsc_recv_callback(ndev, nvdev, nvchan);
nvchan->rsc.cnt = 0;
return ret;
drop:
/* Drop incomplete packet */
nvchan->rsc.cnt = 0;
return NVSP_STAT_FAIL;
}
int rndis_filter_receive(struct net_device *ndev,
struct netvsc_device *net_dev,
struct vmbus_channel *channel,
struct netvsc_channel *nvchan,
void *data, u32 buflen)
{
struct net_device_context *net_device_ctx = netdev_priv(ndev);
@ -422,7 +481,7 @@ int rndis_filter_receive(struct net_device *ndev,
switch (rndis_msg->ndis_msg_type) {
case RNDIS_MSG_PACKET:
return rndis_filter_receive_data(ndev, net_dev, channel,
return rndis_filter_receive_data(ndev, net_dev, nvchan,
rndis_msg, buflen);
case RNDIS_MSG_INIT_C:
case RNDIS_MSG_QUERY_C:
@ -657,7 +716,7 @@ int rndis_filter_set_device_mac(struct netvsc_device *nvdev,
return ret;
}
static int
int
rndis_filter_set_offload_params(struct net_device *ndev,
struct netvsc_device *nvdev,
struct ndis_offload_params *req_offloads)
@ -1184,6 +1243,18 @@ static int rndis_netdev_set_hwcaps(struct rndis_device *rndis_device,
}
}
if (hwcaps.rsc.ip4 && hwcaps.rsc.ip6) {
net->hw_features |= NETIF_F_LRO;
if (net->features & NETIF_F_LRO) {
offloads.rsc_ip_v4 = NDIS_OFFLOAD_PARAMETERS_RSC_ENABLED;
offloads.rsc_ip_v6 = NDIS_OFFLOAD_PARAMETERS_RSC_ENABLED;
} else {
offloads.rsc_ip_v4 = NDIS_OFFLOAD_PARAMETERS_RSC_DISABLED;
offloads.rsc_ip_v6 = NDIS_OFFLOAD_PARAMETERS_RSC_DISABLED;
}
}
/* In case some hw_features disappeared we need to remove them from
* net->features list as they're no longer supported.
*/