mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2024-12-16 17:46:43 +07:00
nfp: bpf: optimize the RMW for stack accesses
When we are performing unaligned stack accesses in the 32-64B window we have to do a read-modify-write cycle. E.g. for reading 8 bytes from address 17:

    0: tmp    = stack[16]
    1: gprLo  = tmp >> 8
    2: tmp    = stack[20]
    3: gprLo |= tmp << 24
    4: tmp    = stack[20]
    5: gprHi  = tmp >> 8
    6: tmp    = stack[24]
    7: gprHi |= tmp << 24

The load on line 4 is unnecessary, because tmp already contains data from stack[20].

For writes we can optimize both loads and writebacks away.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
parent
a82b23fb38
commit
9a90c83c09
@ -644,11 +644,11 @@ data_st_host_order(struct nfp_prog *nfp_prog, u8 dst_gpr, swreg offset,
|
|||||||
|
|
||||||
typedef int
|
typedef int
|
||||||
(*lmem_step)(struct nfp_prog *nfp_prog, u8 gpr, u8 gpr_byte, s32 off,
|
(*lmem_step)(struct nfp_prog *nfp_prog, u8 gpr, u8 gpr_byte, s32 off,
|
||||||
unsigned int size, bool new_gpr);
|
unsigned int size, bool first, bool new_gpr, bool last);
|
||||||
|
|
||||||
static int
|
static int
|
||||||
wrp_lmem_load(struct nfp_prog *nfp_prog, u8 dst, u8 dst_byte, s32 off,
|
wrp_lmem_load(struct nfp_prog *nfp_prog, u8 dst, u8 dst_byte, s32 off,
|
||||||
unsigned int size, bool new_gpr)
|
unsigned int size, bool first, bool new_gpr, bool last)
|
||||||
{
|
{
|
||||||
u32 idx, src_byte;
|
u32 idx, src_byte;
|
||||||
enum shf_sc sc;
|
enum shf_sc sc;
|
||||||
@ -692,7 +692,13 @@ wrp_lmem_load(struct nfp_prog *nfp_prog, u8 dst, u8 dst_byte, s32 off,
|
|||||||
reg = reg_lm(0, idx);
|
reg = reg_lm(0, idx);
|
||||||
} else {
|
} else {
|
||||||
reg = imm_a(nfp_prog);
|
reg = imm_a(nfp_prog);
|
||||||
wrp_mov(nfp_prog, reg, reg_lm(0, idx));
|
/* If it's not the first part of the load and we start a new GPR
|
||||||
|
* that means we are loading a second part of the LMEM word into
|
||||||
|
* a new GPR. IOW we've already looked at that LMEM word and
|
||||||
|
* therefore it has been loaded into imm_a().
|
||||||
|
*/
|
||||||
|
if (first || !new_gpr)
|
||||||
|
wrp_mov(nfp_prog, reg, reg_lm(0, idx));
|
||||||
}
|
}
|
||||||
|
|
||||||
emit_ld_field_any(nfp_prog, reg_both(dst), mask, reg, sc, shf, new_gpr);
|
emit_ld_field_any(nfp_prog, reg_both(dst), mask, reg, sc, shf, new_gpr);
|
||||||
@ -702,7 +708,7 @@ wrp_lmem_load(struct nfp_prog *nfp_prog, u8 dst, u8 dst_byte, s32 off,
|
|||||||
|
|
||||||
static int
|
static int
|
||||||
wrp_lmem_store(struct nfp_prog *nfp_prog, u8 src, u8 src_byte, s32 off,
|
wrp_lmem_store(struct nfp_prog *nfp_prog, u8 src, u8 src_byte, s32 off,
|
||||||
unsigned int size, bool new_gpr)
|
unsigned int size, bool first, bool new_gpr, bool last)
|
||||||
{
|
{
|
||||||
u32 idx, dst_byte;
|
u32 idx, dst_byte;
|
||||||
enum shf_sc sc;
|
enum shf_sc sc;
|
||||||
@ -746,13 +752,19 @@ wrp_lmem_store(struct nfp_prog *nfp_prog, u8 src, u8 src_byte, s32 off,
|
|||||||
reg = reg_lm(0, idx);
|
reg = reg_lm(0, idx);
|
||||||
} else {
|
} else {
|
||||||
reg = imm_a(nfp_prog);
|
reg = imm_a(nfp_prog);
|
||||||
wrp_mov(nfp_prog, reg, reg_lm(0, idx));
|
/* Only first and last LMEM locations are going to need RMW,
|
||||||
|
* the middle location will be overwritten fully.
|
||||||
|
*/
|
||||||
|
if (first || last)
|
||||||
|
wrp_mov(nfp_prog, reg, reg_lm(0, idx));
|
||||||
}
|
}
|
||||||
|
|
||||||
emit_ld_field(nfp_prog, reg, mask, reg_b(src), sc, shf);
|
emit_ld_field(nfp_prog, reg, mask, reg_b(src), sc, shf);
|
||||||
|
|
||||||
if (idx > RE_REG_LM_IDX_MAX)
|
if (new_gpr || last) {
|
||||||
wrp_mov(nfp_prog, reg_lm(0, idx), reg);
|
if (idx > RE_REG_LM_IDX_MAX)
|
||||||
|
wrp_mov(nfp_prog, reg_lm(0, idx), reg);
|
||||||
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
@ -762,6 +774,7 @@ mem_op_stack(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
|
|||||||
unsigned int size, u8 gpr, bool clr_gpr, lmem_step step)
|
unsigned int size, u8 gpr, bool clr_gpr, lmem_step step)
|
||||||
{
|
{
|
||||||
s32 off = nfp_prog->stack_depth + meta->insn.off;
|
s32 off = nfp_prog->stack_depth + meta->insn.off;
|
||||||
|
bool first = true, last;
|
||||||
u8 prev_gpr = 255;
|
u8 prev_gpr = 255;
|
||||||
u32 gpr_byte = 0;
|
u32 gpr_byte = 0;
|
||||||
int ret;
|
int ret;
|
||||||
@ -777,12 +790,16 @@ mem_op_stack(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
|
|||||||
slice_end = min(off + slice_size, round_up(off + 1, 4));
|
slice_end = min(off + slice_size, round_up(off + 1, 4));
|
||||||
slice_size = slice_end - off;
|
slice_size = slice_end - off;
|
||||||
|
|
||||||
|
last = slice_size == size;
|
||||||
|
|
||||||
ret = step(nfp_prog, gpr, gpr_byte, off, slice_size,
|
ret = step(nfp_prog, gpr, gpr_byte, off, slice_size,
|
||||||
gpr != prev_gpr);
|
first, gpr != prev_gpr, last);
|
||||||
if (ret)
|
if (ret)
|
||||||
return ret;
|
return ret;
|
||||||
|
|
||||||
prev_gpr = gpr;
|
prev_gpr = gpr;
|
||||||
|
first = false;
|
||||||
|
|
||||||
gpr_byte += slice_size;
|
gpr_byte += slice_size;
|
||||||
if (gpr_byte >= 4) {
|
if (gpr_byte >= 4) {
|
||||||
gpr_byte -= 4;
|
gpr_byte -= 4;
|
||||||
|
Loading…
Reference in New Issue
Block a user