powerpc/lib: Implement strlen() in assembly for PPC32
The generic implementation of strlen() reads strings byte by byte.

This patch implements strlen() in assembly based on a read of entire
words, in the same spirit as what some other arches and glibc do.

On an 8xx the time spent in strlen is reduced by 3/4 for long strings.

strlen() selftest on an 8xx provides the following values:

Before the patch (i.e. with the generic strlen() in lib/string.c):

	len 256 : time = 1.195055
	len 016 : time = 0.083745
	len 008 : time = 0.046828
	len 004 : time = 0.028390

After the patch:

	len 256 : time = 0.272185 ==> 78% improvement
	len 016 : time = 0.040632 ==> 51% improvement
	len 008 : time = 0.033060 ==> 29% improvement
	len 004 : time = 0.029149 ==> 2% degradation

On a 832x:

Before the patch:

	len 256 : time = 0.236125
	len 016 : time = 0.018136
	len 008 : time = 0.011000
	len 004 : time = 0.007229

After the patch:

	len 256 : time = 0.094950 ==> 60% improvement
	len 016 : time = 0.013357 ==> 26% improvement
	len 008 : time = 0.010586 ==> 4% improvement
	len 004 : time = 0.008784

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
parent f0abbfd89f
commit 9412b23450
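For context before the diff: the speedup comes from testing four bytes at a
time for a zero byte, using two magic constants. A minimal C sketch of that
test (not part of the patch; the function name is illustrative) could look
like this. The same expression appears in the algorithm comment of the new
strlen_32.S below.

	#include <stdint.h>

	/*
	 * Illustrative sketch, not kernel code.
	 * Nonzero iff the 32-bit word x contains at least one 0x00 byte.
	 * A high bit only flips from 0 to 1 in the subtraction when a borrow
	 * runs through a 0x00 byte, so false per-byte markers can only appear
	 * in bytes more significant than a real zero.
	 */
	static inline uint32_t word_has_zero_byte(uint32_t x)
	{
		return (x - 0x01010101u) & ~x & 0x80808080u;
	}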
arch/powerpc/include/asm/string.h:

@@ -50,6 +50,8 @@ static inline void *memset64(uint64_t *p, uint64_t v, __kernel_size_t n)
 	return __memset64(p, v, n * 8);
 }
 #else
+#define __HAVE_ARCH_STRLEN
+
 extern void *memset16(uint16_t *, uint16_t, __kernel_size_t);
 #endif
 #endif /* __KERNEL__ */
arch/powerpc/lib/Makefile:

@@ -12,7 +12,7 @@ CFLAGS_REMOVE_feature-fixups.o = $(CC_FLAGS_FTRACE)
 
 obj-y += string.o alloc.o code-patching.o feature-fixups.o
 
-obj-$(CONFIG_PPC32) += div64.o copy_32.o crtsavres.o
+obj-$(CONFIG_PPC32) += div64.o copy_32.o crtsavres.o strlen_32.o
 
 # See corresponding test in arch/powerpc/Makefile
 # 64-bit linker creates .sfpr on demand for final link (vmlinux),
arch/powerpc/lib/strlen_32.S (new file, 78 lines):

@@ -0,0 +1,78 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * strlen() for PPC32
 *
 * Copyright (C) 2018 Christophe Leroy CS Systemes d'Information.
 *
 * Inspired by the glibc implementation
 */
#include <asm/ppc_asm.h>
#include <asm/export.h>
#include <asm/cache.h>

	.text

/*
 * Algorithm:
 *
 * 1) Given a word 'x', we can test to see if it contains any 0 bytes
 *    by subtracting 0x01010101, and seeing if any of the high bits of each
 *    byte changed from 0 to 1. This works because the least significant
 *    0 byte must have had no incoming carry (otherwise it's not the least
 *    significant), so it is 0x00 - 0x01 == 0xff. For all other
 *    byte values, either they have the high bit set initially, or when
 *    1 is subtracted you get a value in the range 0x00-0x7f, none of which
 *    have their high bit set. The expression here is
 *    (x - 0x01010101) & ~x & 0x80808080, which gives 0x00000000 when
 *    there were no 0x00 bytes in the word. You get 0x80 in bytes that
 *    match, but possibly false 0x80 matches in bytes more significant
 *    than a true match, due to carries. For little-endian this is
 *    of no consequence since the least significant match is the one
 *    we're interested in, but big-endian needs method 2 to find which
 *    byte matches.
 *
 * 2) Given a word 'x', we can test to see _which_ byte was zero by
 *    calculating ~(((x & ~0x80808080) - 0x80808080 - 1) | x | ~0x80808080).
 *    This produces 0x80 in each byte that was zero, and 0x00 in all
 *    the other bytes. The '| ~0x80808080' clears the low 7 bits in each
 *    byte, and the '| x' part ensures that bytes with the high bit set
 *    produce 0x00. The addition will carry into the high bit of each byte
 *    iff that byte had one of its low 7 bits set. We can then just see
 *    which was the most significant bit set and divide by 8 to find how
 *    many to add to the index.
 *    This is from the book 'The PowerPC Compiler Writer's Guide',
 *    by Steve Hoxey, Faraydon Karim, Bill Hay and Hank Warren.
 */

_GLOBAL(strlen)
	andi.	r0, r3, 3	/* r0 = alignment offset of the string */
	lis	r7, 0x0101
	addi	r10, r3, -4	/* so the lwzu below pre-increments onto the string */
	addic	r7, r7, 0x0101	/* r7 = 0x01010101 (lomagic) & clear XER[CA] */
	rotlwi	r6, r7, 31	/* r6 = 0x80808080 (himagic) */
	bne-	3f		/* misaligned start: mask the leading bytes first */
	.balign IFETCH_ALIGN_BYTES
1:	lwzu	r9, 4(r10)	/* r9 = next word of the string */
2:	subf	r8, r7, r9	/* r8 = word - lomagic */
	and.	r8, r8, r6	/* did any high bit flip from 0 to 1? */
	beq+	1b		/* no candidate 0x00 byte: keep scanning */
	andc.	r8, r8, r9	/* discard bytes whose high bit was already set */
	beq+	1b		/* false alarm (bytes >= 0x81 only): keep scanning */
	/* Method 2: compute 0x80 in each zero byte, 0x00 elsewhere */
	andc	r8, r9, r6	/* r8 = x & ~0x80808080 */
	orc	r9, r9, r6	/* r9 = x | ~0x80808080 */
	subfe	r8, r6, r8	/* r8 = (x & ~0x80808080) - 0x80808080 - 1 (CA is 0) */
	nor	r8, r8, r9	/* r8 = ~(r8 | x | ~0x80808080) */
	cntlzw	r8, r8		/* leading zeros = 8 * index of the first zero byte */
	subf	r3, r3, r10	/* r3 = bytes scanned before the current word */
	srwi	r8, r8, 3	/* bits -> byte index within the word */
	add	r3, r3, r8	/* r3 = string length */
	blr

	/* Misaligned string: make sure the bytes before the string are not seen as 0 */
3:	xor	r10, r10, r0	/* align r10 down to a word boundary */
	orc	r8, r8, r8	/* r8 = 0xffffffff */
	lwzu	r9, 4(r10)	/* load the word containing the start of the string */
	slwi	r0, r0, 3	/* alignment offset in bits */
	srw	r8, r8, r0	/* r8 = mask covering the in-string bytes */
	orc	r9, r9, r8	/* force the leading (pre-string) bytes to 0xff */
	b	2b
EXPORT_SYMBOL(strlen)
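Since the explanation lives in the comment block of the new file, a C model
of the whole routine may help. This is a sketch only, assuming a 32-bit
big-endian target and a 4-byte-aligned input (the misaligned prologue is
sketched separately below); strlen_word_at_a_time is an illustrative name,
not kernel API.

	#include <stddef.h>
	#include <stdint.h>

	/* C model of the BE32 word-at-a-time strlen (illustrative, not kernel code).
	 * Assumes s is 4-byte aligned. */
	static size_t strlen_word_at_a_time(const char *s)
	{
		const uint32_t lomagic = 0x01010101u;	/* r7 in the assembly */
		const uint32_t himagic = 0x80808080u;	/* r6 in the assembly */
		const uint32_t *w = (const uint32_t *)s;
		uint32_t x, mark;
		unsigned int bits = 0;

		/* Method 1: skip words that contain no 0x00 byte. */
		for (;;) {
			x = *w;
			if (((x - lomagic) & ~x & himagic) != 0)
				break;		/* at least one byte of x is 0x00 */
			w++;
		}

		/* Method 2: mark = 0x80 in each zero byte, 0x00 elsewhere
		 * (needed on big-endian, where the most significant marker
		 * is the first zero byte in memory). */
		mark = ~(((x & ~himagic) - himagic - 1u) | x | ~himagic);

		/* Count leading zero bits, as cntlzw does in the assembly. */
		while (!(mark & 0x80000000u)) {
			mark <<= 1;
			bits++;
		}

		return (size_t)((const char *)w - s) + bits / 8;
	}

A note on the design: reading the string a whole aligned word at a time is
safe even past the terminator, because page boundaries are word-aligned, so
the aligned word containing the last string byte can never touch an unmapped
page.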
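The misaligned entry path (label 3 above) can be modeled the same way. Again
a hedged sketch for big-endian with illustrative names: the pointer is
rounded down to a word boundary and the bytes that precede the string are
forced non-zero, so they can never look like a terminator.

	#include <stdint.h>

	/* Model of the misaligned prologue (big-endian, illustrative).
	 * Returns the first word to scan, with pre-string bytes forced
	 * non-zero, and stores the aligned word pointer through wp.
	 * The assembly only takes this path when ofs != 0. */
	static uint32_t first_word_masked(const char *s, const uint32_t **wp)
	{
		uint32_t ofs = (uint32_t)(uintptr_t)s & 3u;	 /* andi. r0, r3, 3 */
		const uint32_t *w = (const uint32_t *)(s - ofs); /* xor r10, r10, r0 */
		uint32_t mask = 0xffffffffu >> (8u * ofs);	 /* srw r8, r8, r0 */
		uint32_t x = *w;				 /* lwzu r9, 4(r10) */

		*wp = w;
		return x | ~mask;	/* orc r9, r9, r8: leading bytes become 0xff */
	}

The assembly then branches back to label 2 with this word already loaded,
so the masked leading bytes go through the same zero-byte test as any other
word, and the subf/srwi epilogue still yields the correct length.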