linux_dsm_epyc7002/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_descs.c

372 lines
8.7 KiB
C
Raw Normal View History

// SPDX-License-Identifier: (GPL-2.0 OR MIT)
/*
* Copyright (c) 2018 Synopsys, Inc. and/or its affiliates.
* stmmac XGMAC support.
*/
#include <linux/stmmac.h>
#include "common.h"
#include "dwxgmac2.h"
/*
 * Classify a TX descriptor from its des3 word.
 *
 * Returns tx_dma_own while XGMAC_TDES3_OWN is set, tx_not_ls when
 * XGMAC_TDES3_LD is clear, and tx_done otherwise.
 * @data, @x and @ioaddr are not used by this implementation.
 */
static int dwxgmac2_get_tx_status(void *data, struct stmmac_extra_stats *x,
				  struct dma_desc *p, void __iomem *ioaddr)
{
	const unsigned int status = le32_to_cpu(p->des3);

	if (unlikely(status & XGMAC_TDES3_OWN))
		return tx_dma_own;

	if (likely(!(status & XGMAC_TDES3_LD)))
		return tx_not_ls;

	return tx_done;
}
static int dwxgmac2_get_rx_status(void *data, struct stmmac_extra_stats *x,
struct dma_desc *p)
{
unsigned int rdes3 = le32_to_cpu(p->des3);
if (unlikely(rdes3 & XGMAC_RDES3_OWN))
return dma_own;
net: stmmac: Add Split Header support and enable it in XGMAC cores Add the support for Split Header feature in the RX path and enable it in XGMAC cores. This does not impact neither beneficts bandwidth but it does reduces CPU usage because without the feature all the entire packet is memcpy'ed, while that with the feature only the header is. With Split Header disabled 'perf stat -d' gives: 86870.624945 task-clock (msec) # 0.429 CPUs utilized 1073352 context-switches # 0.012 M/sec 1 cpu-migrations # 0.000 K/sec 213 page-faults # 0.002 K/sec 327113872376 cycles # 3.766 GHz (62.53%) 56618161216 instructions # 0.17 insn per cycle (75.06%) 10742205071 branches # 123.658 M/sec (75.36%) 584309242 branch-misses # 5.44% of all branches (75.19%) 17594787965 L1-dcache-loads # 202.540 M/sec (74.88%) 4003773131 L1-dcache-load-misses # 22.76% of all L1-dcache hits (74.89%) 1313301468 LLC-loads # 15.118 M/sec (49.75%) 355906510 LLC-load-misses # 27.10% of all LL-cache hits (49.92%) With Split Header enabled 'perf stat -d' gives: 49324.456539 task-clock (msec) # 0.245 CPUs utilized 2542387 context-switches # 0.052 M/sec 1 cpu-migrations # 0.000 K/sec 213 page-faults # 0.004 K/sec 177092791469 cycles # 3.590 GHz (62.30%) 68555756017 instructions # 0.39 insn per cycle (75.16%) 12697019382 branches # 257.418 M/sec (74.81%) 442081897 branch-misses # 3.48% of all branches (74.79%) 20337958358 L1-dcache-loads # 412.330 M/sec (75.46%) 3820210140 L1-dcache-load-misses # 18.78% of all L1-dcache hits (75.35%) 1257719198 LLC-loads # 25.499 M/sec (49.73%) 685543923 LLC-load-misses # 54.51% of all LL-cache hits (49.86%) Changes from v2: - Reword commit message (Jakub) Changes from v1: - Add performance info (David) - Add misssing dma_sync_single_for_device() Signed-off-by: Jose Abreu <joabreu@synopsys.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-18 01:54:43 +07:00
if (unlikely(rdes3 & XGMAC_RDES3_CTXT))
return discard_frame;
if (likely(!(rdes3 & XGMAC_RDES3_LD)))
return rx_not_ls;
if (unlikely((rdes3 & XGMAC_RDES3_ES) && (rdes3 & XGMAC_RDES3_LD)))
return discard_frame;
return good_frame;
}
/* Return the des2 bits selected by the XGMAC_TDES2_B1L mask. */
static int dwxgmac2_get_tx_len(struct dma_desc *p)
{
	const unsigned int tdes2 = le32_to_cpu(p->des2);

	return tdes2 & XGMAC_TDES2_B1L;
}
/* Return 1 when XGMAC_TDES3_OWN is set in des3, 0 otherwise. */
static int dwxgmac2_get_tx_owner(struct dma_desc *p)
{
	const unsigned int tdes3 = le32_to_cpu(p->des3);

	return (tdes3 & XGMAC_TDES3_OWN) != 0;
}
/* Set XGMAC_TDES3_OWN in des3, leaving every other des3 bit as it was. */
static void dwxgmac2_set_tx_owner(struct dma_desc *p)
{
	p->des3 = p->des3 | cpu_to_le32(XGMAC_TDES3_OWN);
}
static void dwxgmac2_set_rx_owner(struct dma_desc *p, int disable_rx_ic)
{
net: stmmac: Add Split Header support and enable it in XGMAC cores Add the support for Split Header feature in the RX path and enable it in XGMAC cores. This does not impact neither beneficts bandwidth but it does reduces CPU usage because without the feature all the entire packet is memcpy'ed, while that with the feature only the header is. With Split Header disabled 'perf stat -d' gives: 86870.624945 task-clock (msec) # 0.429 CPUs utilized 1073352 context-switches # 0.012 M/sec 1 cpu-migrations # 0.000 K/sec 213 page-faults # 0.002 K/sec 327113872376 cycles # 3.766 GHz (62.53%) 56618161216 instructions # 0.17 insn per cycle (75.06%) 10742205071 branches # 123.658 M/sec (75.36%) 584309242 branch-misses # 5.44% of all branches (75.19%) 17594787965 L1-dcache-loads # 202.540 M/sec (74.88%) 4003773131 L1-dcache-load-misses # 22.76% of all L1-dcache hits (74.89%) 1313301468 LLC-loads # 15.118 M/sec (49.75%) 355906510 LLC-load-misses # 27.10% of all LL-cache hits (49.92%) With Split Header enabled 'perf stat -d' gives: 49324.456539 task-clock (msec) # 0.245 CPUs utilized 2542387 context-switches # 0.052 M/sec 1 cpu-migrations # 0.000 K/sec 213 page-faults # 0.004 K/sec 177092791469 cycles # 3.590 GHz (62.30%) 68555756017 instructions # 0.39 insn per cycle (75.16%) 12697019382 branches # 257.418 M/sec (74.81%) 442081897 branch-misses # 3.48% of all branches (74.79%) 20337958358 L1-dcache-loads # 412.330 M/sec (75.46%) 3820210140 L1-dcache-load-misses # 18.78% of all L1-dcache hits (75.35%) 1257719198 LLC-loads # 25.499 M/sec (49.73%) 685543923 LLC-load-misses # 54.51% of all LL-cache hits (49.86%) Changes from v2: - Reword commit message (Jakub) Changes from v1: - Add performance info (David) - Add misssing dma_sync_single_for_device() Signed-off-by: Jose Abreu <joabreu@synopsys.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-18 01:54:43 +07:00
p->des3 |= cpu_to_le32(XGMAC_RDES3_OWN);
if (!disable_rx_ic)
p->des3 |= cpu_to_le32(XGMAC_RDES3_IOC);
}
static int dwxgmac2_get_tx_ls(struct dma_desc *p)
{
return (le32_to_cpu(p->des3) & XGMAC_RDES3_LD) > 0;
}
/*
 * Return the des3 bits selected by the XGMAC_RDES3_PL mask.
 * @rx_coe is not used by this implementation.
 */
static int dwxgmac2_get_rx_frame_len(struct dma_desc *p, int rx_coe)
{
	const unsigned int rdes3 = le32_to_cpu(p->des3);

	return rdes3 & XGMAC_RDES3_PL;
}
/* Set XGMAC_TDES2_TTSE in des2, leaving every other des2 bit as it was. */
static void dwxgmac2_enable_tx_timestamp(struct dma_desc *p)
{
	p->des2 = p->des2 | cpu_to_le32(XGMAC_TDES2_TTSE);
}
/*
 * TX timestamp status is not supported through the descriptor here:
 * the descriptor is not inspected and 0 is always returned.
 */
static int dwxgmac2_get_tx_timestamp_status(struct dma_desc *p)
{
	return 0;
}
/*
 * Combine des1 and des0 of @desc into one 64-bit value stored in *@ts:
 * des1 is multiplied by 1000000000 and des0 is added to the product
 * (presumably seconds and nanoseconds respectively - confirm against the
 * hardware documentation). @ats is not used by this implementation.
 */
static inline void dwxgmac2_get_timestamp(void *desc, u32 ats, u64 *ts)
{
	struct dma_desc *p = desc;
	u64 high = le32_to_cpu(p->des1);
	u64 low = le32_to_cpu(p->des0);

	*ts = high * 1000000000ULL + low;
}
/*
 * Check whether @desc (treated as a struct dma_desc) holds a usable RX
 * timestamp.
 *
 * Returns 0 when, in des3, XGMAC_RDES3_OWN is clear, XGMAC_RDES3_CTXT is
 * set, XGMAC_RDES3_TSD is clear and XGMAC_RDES3_TSA is set, and des0/des1
 * are not both 0xffffffff. Returns -EINVAL in every other case.
 */
static int dwxgmac2_rx_check_timestamp(void *desc)
{
struct dma_desc *p = (struct dma_desc *)desc;
unsigned int rdes3 = le32_to_cpu(p->des3);
bool desc_valid, ts_valid;
/*
 * Read barrier between the des3 load above and the des0/des1 loads below;
 * presumably so the timestamp words are not fetched ahead of the
 * ownership word - keep this ordering.
 */
dma_rmb();
desc_valid = !(rdes3 & XGMAC_RDES3_OWN) && (rdes3 & XGMAC_RDES3_CTXT);
ts_valid = !(rdes3 & XGMAC_RDES3_TSD) && (rdes3 & XGMAC_RDES3_TSA);
if (likely(desc_valid && ts_valid)) {
/* An all-ones des0/des1 pair is rejected as an invalid timestamp. */
if ((p->des0 == 0xffffffff) && (p->des1 == 0xffffffff))
return -EINVAL;
return 0;
}
return -EINVAL;
}
/*
 * Report whether an RX timestamp is available for @desc.
 *
 * When XGMAC_RDES3_CDA is set in @desc's des3, @next_desc is validated with
 * dwxgmac2_rx_check_timestamp() and 1 is returned if that check succeeds
 * (returns 0). In every other case 0 is returned.
 * @ats is not used by this implementation.
 */
static int dwxgmac2_get_rx_timestamp_status(void *desc, void *next_desc,
					    u32 ats)
{
	struct dma_desc *p = desc;
	const unsigned int status = le32_to_cpu(p->des3);

	if (likely(status & XGMAC_RDES3_CDA))
		return dwxgmac2_rx_check_timestamp(next_desc) == 0;

	return 0;
}
/*
 * Initialise an RX descriptor.
 *
 * All of the work is delegated to dwxgmac2_set_rx_owner(), which receives
 * @p and @disable_rx_ic; @mode, @end and @bfsize are not used by this
 * implementation.
 */
static void dwxgmac2_init_rx_desc(struct dma_desc *p, int disable_rx_ic,
				  int mode, int end, int bfsize)
{
	dwxgmac2_set_rx_owner(p, disable_rx_ic);
}
/*
 * Initialise a TX descriptor by writing zero to des0 through des3.
 * @mode and @end are not used by this implementation.
 */
static void dwxgmac2_init_tx_desc(struct dma_desc *p, int mode, int end)
{
p->des0 = 0;
p->des1 = 0;
p->des2 = 0;
p->des3 = 0;
}
/*
 * Fill in the length and control bits of a TX descriptor.
 *
 * des2 gets @len (masked with XGMAC_TDES2_B1L) OR-ed in. des3 is rebuilt
 * from its current value: @tot_pkt_len (masked with XGMAC_TDES3_FL) is
 * OR-ed in, XGMAC_TDES3_FD follows @is_fs, the XGMAC_TDES3_CIC field is set
 * to 0x3 or cleared according to @csum_flag, XGMAC_TDES3_LD follows @ls,
 * and XGMAC_TDES3_OWN is set when @tx_own is true.
 * @mode is not used by this implementation.
 */
static void dwxgmac2_prepare_tx_desc(struct dma_desc *p, int is_fs, int len,
bool csum_flag, int mode, bool tx_own,
bool ls, unsigned int tot_pkt_len)
{
/* Start from the current des3 contents; bits not touched below are kept. */
unsigned int tdes3 = le32_to_cpu(p->des3);
/* Lengths are OR-ed in, so stale length bits are not cleared here. */
p->des2 |= cpu_to_le32(len & XGMAC_TDES2_B1L);
tdes3 |= tot_pkt_len & XGMAC_TDES3_FL;
if (is_fs)
tdes3 |= XGMAC_TDES3_FD;
else
tdes3 &= ~XGMAC_TDES3_FD;
if (csum_flag)
tdes3 |= 0x3 << XGMAC_TDES3_CIC_SHIFT;
else
tdes3 &= ~XGMAC_TDES3_CIC;
if (ls)
tdes3 |= XGMAC_TDES3_LD;
else
tdes3 &= ~XGMAC_TDES3_LD;
/* Finally set the OWN bit. Later the DMA will start! */
if (tx_own)
tdes3 |= XGMAC_TDES3_OWN;
if (is_fs && tx_own)
/* When the own bit, for the first frame, has to be set, all
* descriptors for the same frame has to be set before, to
* avoid race condition.
*/
dma_wmb();
/* Single store of the assembled des3 word, after the barrier above. */
p->des3 = cpu_to_le32(tdes3);
}
/*
 * Fill in the length and control bits of a TX descriptor used for TSO.
 *
 * des2 gets @len1 (masked with XGMAC_TDES2_B1L) and @len2 (shifted by
 * XGMAC_TDES2_B2L_SHIFT, masked with XGMAC_TDES2_B2L) OR-ed in when they
 * are non-zero. des3 is rebuilt from its current value: for a first
 * segment (@is_fs) XGMAC_TDES3_FD and XGMAC_TDES3_TSE are set and
 * @tcphdrlen / @tcppayloadlen are OR-ed into the THL / TPL fields;
 * otherwise XGMAC_TDES3_FD is cleared. XGMAC_TDES3_LD follows @ls and
 * XGMAC_TDES3_OWN is set when @tx_own is true.
 */
static void dwxgmac2_prepare_tso_tx_desc(struct dma_desc *p, int is_fs,
int len1, int len2, bool tx_own,
bool ls, unsigned int tcphdrlen,
unsigned int tcppayloadlen)
{
/* Start from the current des3 contents; bits not touched below are kept. */
unsigned int tdes3 = le32_to_cpu(p->des3);
/* Lengths are OR-ed in, so stale length bits are not cleared here. */
if (len1)
p->des2 |= cpu_to_le32(len1 & XGMAC_TDES2_B1L);
if (len2)
p->des2 |= cpu_to_le32((len2 << XGMAC_TDES2_B2L_SHIFT) &
XGMAC_TDES2_B2L);
if (is_fs) {
tdes3 |= XGMAC_TDES3_FD | XGMAC_TDES3_TSE;
tdes3 |= (tcphdrlen << XGMAC_TDES3_THL_SHIFT) &
XGMAC_TDES3_THL;
tdes3 |= tcppayloadlen & XGMAC_TDES3_TPL;
} else {
tdes3 &= ~XGMAC_TDES3_FD;
}
if (ls)
tdes3 |= XGMAC_TDES3_LD;
else
tdes3 &= ~XGMAC_TDES3_LD;
/* Finally set the OWN bit. Later the DMA will start! */
if (tx_own)
tdes3 |= XGMAC_TDES3_OWN;
if (is_fs && tx_own)
/* When the own bit, for the first frame, has to be set, all
* descriptors for the same frame has to be set before, to
* avoid race condition.
*/
dma_wmb();
/* Single store of the assembled des3 word, after the barrier above. */
p->des3 = cpu_to_le32(tdes3);
}
/*
 * Release a TX descriptor by writing zero to des0 through des3.
 * @mode is not used by this implementation.
 */
static void dwxgmac2_release_tx_desc(struct dma_desc *p, int mode)
{
p->des0 = 0;
p->des1 = 0;
p->des2 = 0;
p->des3 = 0;
}
/* Set XGMAC_TDES2_IOC in des2, leaving every other des2 bit as it was. */
static void dwxgmac2_set_tx_ic(struct dma_desc *p)
{
	p->des2 = p->des2 | cpu_to_le32(XGMAC_TDES2_IOC);
}
/*
 * Overwrite the descriptor with an MSS entry: des0 and des1 are zeroed,
 * des2 carries @mss, and des3 is set to
 * XGMAC_TDES3_CTXT | XGMAC_TDES3_TCMSSV.
 */
static void dwxgmac2_set_mss(struct dma_desc *p, unsigned int mss)
{
	const unsigned int ctrl = XGMAC_TDES3_CTXT | XGMAC_TDES3_TCMSSV;

	p->des0 = 0;
	p->des1 = 0;
	p->des2 = cpu_to_le32(mss);
	p->des3 = cpu_to_le32(ctrl);
}
/* Store the CPU-endian value of des0 in *@addr; des1 is not consulted. */
static void dwxgmac2_get_addr(struct dma_desc *p, unsigned int *addr)
{
	const unsigned int low = le32_to_cpu(p->des0);

	*addr = low;
}
/*
 * Program @addr into the descriptor: the lower 32 bits go to des0 and the
 * upper 32 bits to des1.
 */
static void dwxgmac2_set_addr(struct dma_desc *p, dma_addr_t addr)
{
	const unsigned int low = lower_32_bits(addr);
	const unsigned int high = upper_32_bits(addr);

	p->des0 = cpu_to_le32(low);
	p->des1 = cpu_to_le32(high);
}
/* Clear a descriptor by writing zero to des0 through des3. */
static void dwxgmac2_clear(struct dma_desc *p)
{
p->des0 = 0;
p->des1 = 0;
p->des2 = 0;
p->des3 = 0;
}
/*
 * Extract the RX hash from a descriptor.
 *
 * When XGMAC_RDES3_RSV is clear in des3, nothing is written and -EINVAL is
 * returned. Otherwise *@type is set to PKT_HASH_TYPE_L4 when the
 * XGMAC_RDES3_L34T field is one of the IPv4/IPv6 TCP/UDP codes and to
 * PKT_HASH_TYPE_L3 for any other code, *@hash receives des1, and 0 is
 * returned.
 */
static int dwxgmac2_get_rx_hash(struct dma_desc *p, u32 *hash,
				enum pkt_hash_types *type)
{
	const unsigned int status = le32_to_cpu(p->des3);

	if (!(status & XGMAC_RDES3_RSV))
		return -EINVAL;

	switch ((status & XGMAC_RDES3_L34T) >> XGMAC_RDES3_L34T_SHIFT) {
	case XGMAC_L34T_IP4TCP:
	case XGMAC_L34T_IP4UDP:
	case XGMAC_L34T_IP6TCP:
	case XGMAC_L34T_IP6UDP:
		*type = PKT_HASH_TYPE_L4;
		break;
	default:
		*type = PKT_HASH_TYPE_L3;
		break;
	}

	*hash = le32_to_cpu(p->des1);

	return 0;
}
net: stmmac: Add Split Header support and enable it in XGMAC cores Add the support for Split Header feature in the RX path and enable it in XGMAC cores. This does not impact neither beneficts bandwidth but it does reduces CPU usage because without the feature all the entire packet is memcpy'ed, while that with the feature only the header is. With Split Header disabled 'perf stat -d' gives: 86870.624945 task-clock (msec) # 0.429 CPUs utilized 1073352 context-switches # 0.012 M/sec 1 cpu-migrations # 0.000 K/sec 213 page-faults # 0.002 K/sec 327113872376 cycles # 3.766 GHz (62.53%) 56618161216 instructions # 0.17 insn per cycle (75.06%) 10742205071 branches # 123.658 M/sec (75.36%) 584309242 branch-misses # 5.44% of all branches (75.19%) 17594787965 L1-dcache-loads # 202.540 M/sec (74.88%) 4003773131 L1-dcache-load-misses # 22.76% of all L1-dcache hits (74.89%) 1313301468 LLC-loads # 15.118 M/sec (49.75%) 355906510 LLC-load-misses # 27.10% of all LL-cache hits (49.92%) With Split Header enabled 'perf stat -d' gives: 49324.456539 task-clock (msec) # 0.245 CPUs utilized 2542387 context-switches # 0.052 M/sec 1 cpu-migrations # 0.000 K/sec 213 page-faults # 0.004 K/sec 177092791469 cycles # 3.590 GHz (62.30%) 68555756017 instructions # 0.39 insn per cycle (75.16%) 12697019382 branches # 257.418 M/sec (74.81%) 442081897 branch-misses # 3.48% of all branches (74.79%) 20337958358 L1-dcache-loads # 412.330 M/sec (75.46%) 3820210140 L1-dcache-load-misses # 18.78% of all L1-dcache hits (75.35%) 1257719198 LLC-loads # 25.499 M/sec (49.73%) 685543923 LLC-load-misses # 54.51% of all LL-cache hits (49.86%) Changes from v2: - Reword commit message (Jakub) Changes from v1: - Add performance info (David) - Add misssing dma_sync_single_for_device() Signed-off-by: Jose Abreu <joabreu@synopsys.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-18 01:54:43 +07:00
/*
 * Read the header length from an RX descriptor.
 *
 * When any bit of the XGMAC_RDES3_L34T field is set in des3, *@len receives
 * the des2 bits selected by XGMAC_RDES2_HL. Otherwise *@len is left
 * untouched, so the caller must have initialised it. Always returns 0.
 */
static int dwxgmac2_get_rx_header_len(struct dma_desc *p, unsigned int *len)
{
	if (le32_to_cpu(p->des3) & XGMAC_RDES3_L34T)
		*len = le32_to_cpu(p->des2) & XGMAC_RDES2_HL;

	return 0;
}
/*
 * Program a second address into the descriptor: the lower 32 bits of @addr
 * go to des2 and the upper 32 bits to des3. Both words are overwritten.
 */
static void dwxgmac2_set_sec_addr(struct dma_desc *p, dma_addr_t addr)
{
	const unsigned int low = lower_32_bits(addr);
	const unsigned int high = upper_32_bits(addr);

	p->des2 = cpu_to_le32(low);
	p->des3 = cpu_to_le32(high);
}
/*
 * OR @sarc_type into des3 after shifting it by XGMAC_TDES3_SAIC_SHIFT and
 * masking it with XGMAC_TDES3_SAIC. Existing des3 bits are kept.
 */
static void dwxgmac2_set_sarc(struct dma_desc *p, u32 sarc_type)
{
	const u32 field = (sarc_type << XGMAC_TDES3_SAIC_SHIFT) &
			  XGMAC_TDES3_SAIC;

	p->des3 |= cpu_to_le32(field);
}
/*
 * Overwrite the descriptor with a VLAN tag entry.
 *
 * All four words are cleared first. When @inner_type is non-zero, des2
 * receives @inner_tag in the XGMAC_TDES2_IVT field and des3 receives
 * @inner_type in the XGMAC_TDES3_IVTIR field together with
 * XGMAC_TDES3_IVLTV. In every case @tag (masked with XGMAC_TDES3_VT),
 * XGMAC_TDES3_VLTV and XGMAC_TDES3_CTXT are then OR-ed into des3.
 */
static void dwxgmac2_set_vlan_tag(struct dma_desc *p, u16 tag, u16 inner_tag,
				  u32 inner_type)
{
	p->des0 = 0;
	p->des1 = 0;
	p->des2 = 0;
	p->des3 = 0;

	/* Inner VLAN */
	if (inner_type) {
		/*
		 * Widen to u32 before shifting: a u16 operand is promoted to
		 * signed int, and shifting a set top bit into the sign bit
		 * would not be well-defined.
		 */
		u32 des = (u32)inner_tag << XGMAC_TDES2_IVT_SHIFT;

		des &= XGMAC_TDES2_IVT;
		p->des2 = cpu_to_le32(des);

		des = inner_type << XGMAC_TDES3_IVTIR_SHIFT;
		des &= XGMAC_TDES3_IVTIR;
		p->des3 = cpu_to_le32(des | XGMAC_TDES3_IVLTV);
	}

	/* Outer VLAN */
	p->des3 |= cpu_to_le32(tag & XGMAC_TDES3_VT);
	p->des3 |= cpu_to_le32(XGMAC_TDES3_VLTV);
	p->des3 |= cpu_to_le32(XGMAC_TDES3_CTXT);
}
/*
 * OR @type into des2 after shifting it by XGMAC_TDES2_VTIR_SHIFT and
 * masking it with XGMAC_TDES2_VTIR. Existing des2 bits are kept.
 */
static void dwxgmac2_set_vlan(struct dma_desc *p, u32 type)
{
	const u32 field = (type << XGMAC_TDES2_VTIR_SHIFT) & XGMAC_TDES2_VTIR;

	p->des2 |= cpu_to_le32(field);
}
const struct stmmac_desc_ops dwxgmac210_desc_ops = {
.tx_status = dwxgmac2_get_tx_status,
.rx_status = dwxgmac2_get_rx_status,
.get_tx_len = dwxgmac2_get_tx_len,
.get_tx_owner = dwxgmac2_get_tx_owner,
.set_tx_owner = dwxgmac2_set_tx_owner,
.set_rx_owner = dwxgmac2_set_rx_owner,
.get_tx_ls = dwxgmac2_get_tx_ls,
.get_rx_frame_len = dwxgmac2_get_rx_frame_len,
.enable_tx_timestamp = dwxgmac2_enable_tx_timestamp,
.get_tx_timestamp_status = dwxgmac2_get_tx_timestamp_status,
.get_rx_timestamp_status = dwxgmac2_get_rx_timestamp_status,
.get_timestamp = dwxgmac2_get_timestamp,
.set_tx_ic = dwxgmac2_set_tx_ic,
.prepare_tx_desc = dwxgmac2_prepare_tx_desc,
.prepare_tso_tx_desc = dwxgmac2_prepare_tso_tx_desc,
.release_tx_desc = dwxgmac2_release_tx_desc,
.init_rx_desc = dwxgmac2_init_rx_desc,
.init_tx_desc = dwxgmac2_init_tx_desc,
.set_mss = dwxgmac2_set_mss,
.get_addr = dwxgmac2_get_addr,
.set_addr = dwxgmac2_set_addr,
.clear = dwxgmac2_clear,
.get_rx_hash = dwxgmac2_get_rx_hash,
net: stmmac: Add Split Header support and enable it in XGMAC cores Add the support for Split Header feature in the RX path and enable it in XGMAC cores. This does not impact neither beneficts bandwidth but it does reduces CPU usage because without the feature all the entire packet is memcpy'ed, while that with the feature only the header is. With Split Header disabled 'perf stat -d' gives: 86870.624945 task-clock (msec) # 0.429 CPUs utilized 1073352 context-switches # 0.012 M/sec 1 cpu-migrations # 0.000 K/sec 213 page-faults # 0.002 K/sec 327113872376 cycles # 3.766 GHz (62.53%) 56618161216 instructions # 0.17 insn per cycle (75.06%) 10742205071 branches # 123.658 M/sec (75.36%) 584309242 branch-misses # 5.44% of all branches (75.19%) 17594787965 L1-dcache-loads # 202.540 M/sec (74.88%) 4003773131 L1-dcache-load-misses # 22.76% of all L1-dcache hits (74.89%) 1313301468 LLC-loads # 15.118 M/sec (49.75%) 355906510 LLC-load-misses # 27.10% of all LL-cache hits (49.92%) With Split Header enabled 'perf stat -d' gives: 49324.456539 task-clock (msec) # 0.245 CPUs utilized 2542387 context-switches # 0.052 M/sec 1 cpu-migrations # 0.000 K/sec 213 page-faults # 0.004 K/sec 177092791469 cycles # 3.590 GHz (62.30%) 68555756017 instructions # 0.39 insn per cycle (75.16%) 12697019382 branches # 257.418 M/sec (74.81%) 442081897 branch-misses # 3.48% of all branches (74.79%) 20337958358 L1-dcache-loads # 412.330 M/sec (75.46%) 3820210140 L1-dcache-load-misses # 18.78% of all L1-dcache hits (75.35%) 1257719198 LLC-loads # 25.499 M/sec (49.73%) 685543923 LLC-load-misses # 54.51% of all LL-cache hits (49.86%) Changes from v2: - Reword commit message (Jakub) Changes from v1: - Add performance info (David) - Add misssing dma_sync_single_for_device() Signed-off-by: Jose Abreu <joabreu@synopsys.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-18 01:54:43 +07:00
.get_rx_header_len = dwxgmac2_get_rx_header_len,
.set_sec_addr = dwxgmac2_set_sec_addr,
.set_sarc = dwxgmac2_set_sarc,
.set_vlan_tag = dwxgmac2_set_vlan_tag,
.set_vlan = dwxgmac2_set_vlan,
};