mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2024-12-28 11:18:45 +07:00
15c2d45d17
I noticed ksm spending quite a lot of time in memcmp on a large KVM box. The current memcmp loop is very unoptimised - byte-at-a-time compares with no loop unrolling. We can do much, much better. Optimise the loop in a few ways: - Unroll the byte-at-a-time loop - For large (at least 32 byte) comparisons that are also 8 byte aligned, use an unrolled modulo scheduled loop using 8 byte loads. This is similar to our glibc memcmp. A simple microbenchmark testing 10000000 iterations of an 8192 byte memcmp was used to measure the performance: baseline: 29.93 s modified: 1.70 s Just over 17x faster. v2: Incorporated some suggestions from Segher: - Use andi. instead of rldicl. - Convert bdnzt eq, to bdnz. It's just duplicating the earlier compare and was a relic from a previous version. - Don't use cr5, we have plans to use that CR field for fast local atomics. Signed-off-by: Anton Blanchard <anton@samba.org> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
167 lines
2.5 KiB
ArmAsm
167 lines
2.5 KiB
ArmAsm
/*
|
|
* String handling functions for PowerPC.
|
|
*
|
|
* Copyright (C) 1996 Paul Mackerras.
|
|
*
|
|
* This program is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU General Public License
|
|
* as published by the Free Software Foundation; either version
|
|
* 2 of the License, or (at your option) any later version.
|
|
*/
|
|
#include <asm/processor.h>
|
|
#include <asm/errno.h>
|
|
#include <asm/ppc_asm.h>
|
|
|
|
/*
 * Touch the exception-table section once up front and apply
 * PPC_LONG_ALIGN to it (macro from the included headers; presumably
 * the alignment PPC_LONG entries need), then switch back to code.
 */
	.section __ex_table,"a"
	PPC_LONG_ALIGN
	.text
|
|
|
|
/*
 * strcpy - copy a NUL-terminated string, terminator included.
 *
 * In:   r3 = destination, r4 = source
 * Out:  r3 = destination (never written by this routine)
 * Uses: r0, r4, r5, cr0
 */
_GLOBAL(strcpy)
	subi	r5,r3,1			/* bias both pointers by -1 so the     */
	subi	r4,r4,1			/* update-form accesses start at byte 0 */
1:	lbzu	r0,1(r4)		/* r0 = *++src */
	cmpwi	r0,0			/* cr0.eq <- byte is the terminator */
	stbu	r0,1(r5)		/* *++dst = r0 (the NUL is stored too) */
	bne	1b			/* loop until the NUL has been copied */
	blr
|
|
|
|
/*
 * strncpy - copy at most r5 bytes of a string.
 *
 * This clears out any unused part of the destination buffer,
 * just as the libc version does. -- paulus
 *
 * In:   r3 = destination, r4 = source, r5 = byte count
 * Out:  r3 = destination (never written by this routine)
 * Uses: r0, r4, r5, r6, ctr, cr0
 */
_GLOBAL(strncpy)
	PPC_LCMPI 0,r5,0
	beqlr				/* count == 0: nothing to copy */
	mtctr	r5			/* ctr = bytes we may still write */
	subi	r6,r3,1			/* bias pointers for the update-form */
	subi	r4,r4,1			/* loads and stores below            */
1:	lbzu	r0,1(r4)		/* r0 = *++src */
	cmpwi	r0,0
	stbu	r0,1(r6)		/* *++dst = r0 */
	bdnzf	eq,1b			/* --ctr; loop while ctr != 0 && r0 != 0 */
	bnelr				/* count ran out before a NUL: done */
	mfctr	r5			/* r5 = destination bytes not yet written */
	PPC_LCMPI 0,r5,0
	beqlr				/* the NUL filled the last slot */
2:	stbu	r0,1(r6)		/* r0 == 0 here: zero-fill the remainder */
	bdnz	2b
	blr
|
|
|
|
/*
 * strcat - append the string at r4 to the string at r3.
 *
 * In:   r3 = destination string, r4 = string to append
 * Out:  r3 = destination (never written by this routine)
 * Uses: r0, r4, r5, cr0
 */
_GLOBAL(strcat)
	subi	r5,r3,1			/* biased cursor into the destination */
	subi	r4,r4,1			/* biased cursor into the source */
	/* Phase 1: walk the destination up to its terminating NUL. */
1:	lbzu	r0,1(r5)
	cmpwi	r0,0
	bne	1b
	subi	r5,r5,1			/* back up so the copy overwrites that NUL */
	/* Phase 2: copy the source, terminator included. */
1:	lbzu	r0,1(r4)
	cmpwi	r0,0
	stbu	r0,1(r5)
	bne	1b
	blr
|
|
|
|
/*
 * strcmp - compare two NUL-terminated strings byte by byte.
 *
 * In:   r3 = first string, r4 = second string
 * Out:  r3 = (byte of first) - (byte of second) at the position where
 *            the scan stopped; 0 when both strings ended together
 * Uses: r0, r4, r5, cr0, cr1
 */
_GLOBAL(strcmp)
	subi	r5,r3,1			/* r3 is reused for the result, so the */
	subi	r4,r4,1			/* first string is walked through r5   */
1:	lbzu	r3,1(r5)		/* r3 = next byte of the first string */
	cmpwi	1,r3,0			/* cr1.eq <- first string ended here */
	lbzu	r0,1(r4)		/* r0 = next byte of the second string */
	sub.	r3,r3,r0		/* r3 = difference; cr0.eq <- bytes match */
	beqlr	1			/* first string ended: return the difference */
	beq	1b			/* bytes match: keep scanning */
	blr				/* mismatch: r3 already holds the result */
|
|
|
|
/*
 * strncmp - compare at most r5 bytes of two strings.
 *
 * In:   r3 = first string, r4 = second string, r5 = maximum byte count
 * Out:  r3 = (byte of first) - (byte of second) at the position where
 *            the scan stopped; 0 when the count is 0 or nothing differed
 * Uses: r0, r4, r5, ctr, cr0, cr1
 */
_GLOBAL(strncmp)
	PPC_LCMPI 0,r5,0
	beq-	2f			/* zero-length compare: equal by definition */
	mtctr	r5			/* count moves to ctr; r5 is reused below */
	subi	r5,r3,1			/* biased cursor into the first string */
	subi	r4,r4,1			/* biased cursor into the second string */
1:	lbzu	r3,1(r5)		/* r3 = next byte of the first string */
	cmpwi	1,r3,0			/* cr1.eq <- first string ended here */
	lbzu	r0,1(r4)		/* r0 = next byte of the second string */
	sub.	r3,r3,r0		/* r3 = difference; cr0.eq <- bytes match */
	beqlr	1			/* first string ended: return the difference */
	bdnzt	eq,1b			/* --ctr; loop while ctr != 0 && bytes match */
	blr
2:	li	r3,0
	blr
|
|
|
|
/*
 * strlen - count the bytes that precede the terminating NUL.
 *
 * In:   r3 = string
 * Out:  r3 = length
 * Uses: r0, r4, cr0
 */
_GLOBAL(strlen)
	subi	r4,r3,1			/* biased cursor for the update-form load */
1:	lbzu	r0,1(r4)		/* r0 = *++cursor */
	cmpwi	r0,0
	bne	1b
	sub	r3,r4,r3		/* length = address of the NUL - start */
	blr
|
|
|
|
/*
 * memcmp - byte-at-a-time comparison of two buffers.
 * Only assembled when CONFIG_PPC32 is defined.
 *
 * In:   r3 = first buffer, r4 = second buffer, r5 = byte count
 * Out:  r3 = (byte of first) - (byte of second) at the first mismatch,
 *            or 0 when the count is 0 or all bytes match
 * Uses: r0, r4, r6, ctr, cr0
 */
#ifdef CONFIG_PPC32
_GLOBAL(memcmp)
	PPC_LCMPI 0,r5,0
	beq-	2f			/* nothing to compare */
	mtctr	r5
	subi	r6,r3,1			/* r3 becomes the result, so walk via r6 */
	subi	r4,r4,1
1:	lbzu	r3,1(r6)		/* r3 = next byte of the first buffer */
	lbzu	r0,1(r4)		/* r0 = next byte of the second buffer */
	sub.	r3,r3,r0		/* r3 = difference; cr0.eq <- bytes match */
	bdnzt	eq,1b			/* --ctr; loop while ctr != 0 && bytes match */
	blr
2:	li	r3,0
	blr
#endif
|
|
|
|
/*
 * memchr - find the first occurrence of a byte value in a buffer.
 *
 * In:   r3 = buffer, r4 = value to look for, r5 = byte count
 * Out:  r3 = address of the first matching byte, or 0 if there is none
 *            (including when the count is 0)
 * Uses: r0, r4, ctr, cr0
 *
 * Only the low 8 bits of r4 take part in the search, matching the C
 * definition where the value is converted to unsigned char.  Without
 * the mask a caller passing e.g. -1 could never match a 0xff byte,
 * because every loaded byte is zero-extended before the compare.
 */
_GLOBAL(memchr)
	PPC_LCMPI 0,r5,0
	beq-	2f			/* empty buffer: not found */
	mtctr	r5
	rlwinm	r4,r4,0,24,31		/* r4 &= 0xff; leaves the CR untouched */
	addi	r3,r3,-1		/* bias pointer for the update-form load */
1:	lbzu	r0,1(r3)		/* r0 = *++p (zero-extended) */
	cmpw	0,r0,r4
	bdnzf	2,1b			/* --ctr; loop while ctr != 0 && no match */
	beqlr				/* match: r3 points at the byte */
2:	li	r3,0
	blr
|
|
|
|
/*
 * __clear_user - store zeros over r4 bytes starting at r3.
 * Only assembled when CONFIG_PPC32 is defined.
 *
 * In:   r3 = start address, r4 = byte count
 * Out:  r3 = 0 when every store completed.  The fixup labels 90/91/92
 *            instead load r3 with a count of bytes still to be cleared.
 * Uses: r0, r4, r5, r6, ctr, cr0
 *
 * Each __ex_table entry at the bottom pairs one store instruction with
 * a fixup label; presumably the fault handler resumes at that label
 * when the store faults (the handler is not part of this file).
 */
#ifdef CONFIG_PPC32
_GLOBAL(__clear_user)
	subi	r6,r3,4			/* biased cursor for update-form stores */
	li	r3,0			/* return value when nothing faults */
	li	r5,0			/* the zero that gets stored */
	cmplwi	r4,4
	blt	7f			/* under 4 bytes: byte stores only */
	/* clear a single word */
11:	stwu	r5,4(r6)		/* r6 = start address afterwards */
	beqlr				/* count was exactly 4 (cr0 from cmplwi) */
	/* clear word sized chunks */
	andi.	r0,r6,3			/* r0 = offset of start within its word */
	add	r4,r0,r4		/* count from the word boundary instead */
	sub	r6,r6,r0		/* r6 = start rounded down to a word */
	srwi	r0,r4,2			/* r0 = whole words, first one included */
	andi.	r4,r4,3			/* r4 = bytes left over after the words */
	mtctr	r0
	bdz	7f			/* skip the word already stored at 11 */
1:	stwu	r5,4(r6)
	bdnz	1b
	/* clear byte sized chunks */
7:	cmpwi	r4,0
	beqlr
	mtctr	r4
	addi	r6,r6,3			/* re-bias: byte stores step by 1, not 4 */
8:	stbu	r5,1(r6)
	bdnz	8b
	blr
	/* fixup for 11b: r4 still holds the full count at that point */
90:	mr	r3,r4
	blr
	/* fixup for 1b: r3 = ctr * 4 + leftover bytes in r4 */
91:	mfctr	r3
	slwi	r3,r3,2
	add	r3,r3,r4
	blr
	/* fixup for 8b: r3 = ctr, the byte stores not yet completed */
92:	mfctr	r3
	blr

	.section __ex_table,"a"
	PPC_LONG	11b,90b
	PPC_LONG	1b,91b
	PPC_LONG	8b,92b
	.text
#endif
|