mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2025-01-25 10:59:53 +07:00
58 lines
1.1 KiB
ArmAsm
58 lines
1.1 KiB
ArmAsm
|
/*
|
||
|
* strlen.S (c) 1995 David Mosberger (davidm@cs.arizona.edu)
|
||
|
*
|
||
|
* Finds length of a 0-terminated string. Optimized for the
|
||
|
* Alpha architecture:
|
||
|
*
|
||
|
* - memory accessed as aligned quadwords only
|
||
|
* - uses cmpbge to compare 8 bytes in parallel
|
||
|
* - does binary search to find 0 byte in last
|
||
|
* quadword (HAKMEM needed 12 instructions to
|
||
|
* do this instead of the 9 instructions that
|
||
|
* binary search needs).
|
||
|
*/
|
||
|
|
||
|
/* Keep the instructions in exactly the order written below. */
.set noreorder
|
||
|
/* Do not let the assembler use the $at temporary register. */
.set noat
|
||
|
|
||
|
/* Start the routine on an 8-byte (2^3) boundary. */
.align 3
|
||
|
|
||
|
.globl strlen
|
||
|
.ent strlen
|
||
|
|
||
|
/*
 * strlen: length of a 0-terminated string.
 *
 * In:      $16 = address of the string (any alignment)
 * Out:     $0  = number of bytes before the terminating 0 byte
 * Scratch: $1, $2, $3
 * Returns through $26.
 *
 * Outline:
 *   1. Load the aligned quadword that contains the first byte and
 *      force the bytes in front of the string to non-zero.
 *   2. Scan one aligned quadword at a time until cmpbge reports a
 *      zero byte ($0 tracks the address of the current quadword).
 *   3. Isolate the lowest set bit of the cmpbge mask and turn its
 *      position into a byte offset with a three-step binary search.
 *   4. Subtract the original pointer from the address of the 0 byte.
 *
 * Only whole aligned quadwords are loaded, so bytes that share a
 * quadword with the string (before its start or after its
 * terminator) are read as well.
 */
strlen:
|
||
|
ldq_u $1, 0($16) # load the aligned quadword containing the first byte ($16 may be misaligned)
|
||
|
lda $2, -1($31) # $2 <- all ones ($31 reads as zero)
|
||
|
insqh $2, $16, $2 # $2 <- 0xff in each byte lane below the start of the string, 0 elsewhere
|
||
|
andnot $16, 7, $0 # $0 <- address of that quadword ($16 rounded down to a multiple of 8)
|
||
|
or $2, $1, $1 # make the bytes before the string non-zero so they cannot match
|
||
|
cmpbge $31, $1, $2 # $2 <- bitmask: bit i == 1 <==> i-th byte == 0
|
||
|
bne $2, found # terminator is already in the first quadword
|
||
|
|
||
|
loop: ldq $1, 8($0) # fetch the next aligned quadword
|
||
|
addq $0, 8, $0 # addr += 8
|
||
|
nop # helps dual issue last two insns
|
||
|
cmpbge $31, $1, $2 # $2 <- zero-byte bitmask for this quadword
|
||
|
beq $2, loop # no zero byte yet: keep scanning
|
||
|
|
||
|
found: blbs $2, done # make aligned case fast: zero byte is byte 0, $0 already points at it
|
||
|
negq $2, $3 # $3 <- -$2
|
||
|
and $2, $3, $2 # $2 <- lowest set bit only (first zero byte)
|
||
|
|
||
|
and $2, 0x0f, $1 # non-zero iff the byte index is 0..3
|
||
|
addq $0, 4, $3 # candidate address: $0 + 4
|
||
|
cmoveq $1, $3, $0 # index is 4..7: advance $0 by 4
|
||
|
|
||
|
and $2, 0x33, $1 # non-zero iff bit 1 of the byte index is clear
|
||
|
addq $0, 2, $3 # candidate address: $0 + 2
|
||
|
cmoveq $1, $3, $0 # bit 1 of the index is set: advance $0 by 2
|
||
|
|
||
|
and $2, 0x55, $1 # non-zero iff bit 0 of the byte index is clear
|
||
|
addq $0, 1, $3 # candidate address: $0 + 1
|
||
|
cmoveq $1, $3, $0 # bit 0 of the index is set: advance $0 by 1
|
||
|
|
||
|
done: subq $0, $16, $0 # length = address of the zero byte - start of the string
|
||
|
ret $31, ($26) # return to the address in $26 with the length in $0
|
||
|
|
||
|
.end strlen
|