linux_dsm_epyc7002/arch/s390/lib/mem.S
Heiko Carstens b4623d4e5b s390: provide memmove implementation
Provide an s390-specific memmove implementation that is faster than
the generic implementation, which copies byte-wise.

For non-destructive (as defined by the mvc instruction) memmove
operations the following table compares the old default implementation
versus the new s390 specific implementation:

size     old   new
   1     1ns   8ns
   2     2ns   8ns
   4     4ns   8ns
   8     7ns   8ns
  16    17ns   8ns
  32    35ns   8ns
  64    65ns   9ns
 128   146ns  10ns
 256   298ns  11ns
 512   537ns  11ns
1024  1193ns  19ns
2048  2405ns  36ns

So the old implementation is faster only for very small sizes. For
overlapping memmoves, where the mvc instruction can't be used, the new
implementation is as slow as the old one.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
2016-12-12 12:11:32 +01:00

131 lines
2.3 KiB
ArmAsm

/*
* String handling functions.
*
* Copyright IBM Corp. 2012
*/
#include <linux/linkage.h>
#include <asm/export.h>
/*
 * void *memmove(void *dest, const void *src, size_t n)
 *
 * In:   %r2 = dest, %r3 = src, %r4 = n
 * Out:  %r2 = dest (never modified by this routine)
 * Uses: %r0, %r1, %r5 as scratch; %r4 is consumed; %r3 is advanced on
 *       the forward path.
 *
 * If dest <= src, or dest >= src + n, the data is copied front to back
 * in 256-byte mvc chunks plus one executed mvc for the remainder.
 * Otherwise (src < dest < src + n) a front-to-back mvc would overwrite
 * source bytes before they are read, so the data is copied back to
 * front one byte at a time.
 */
ENTRY(memmove)
	ltgr	%r4,%r4
	lgr	%r1,%r2			/* %r1 = working dest; lgr leaves cc alone */
	bzr	%r14			/* n == 0: nothing to do */
	clgr	%r2,%r3
	jnh	.Lmemmove_forward	/* dest <= src */
	la	%r5,0(%r4,%r3)		/* %r5 = src + n */
	clgr	%r2,%r5
	jl	.Lmemmove_reverse	/* src < dest < src + n */
.Lmemmove_forward:
	aghi	%r4,-1			/* %r4 = n - 1 */
	srlg	%r0,%r4,8		/* %r0 = (n - 1) / 256 full chunks */
	ltgr	%r0,%r0
	jz	.Lmemmove_rest
.Lmemmove_loop:
	mvc	0(256,%r1),0(%r3)
	la	%r1,256(%r1)
	la	%r3,256(%r3)
	brctg	%r0,.Lmemmove_loop
.Lmemmove_rest:
	/*
	 * ex merges the low byte of %r4 into the length field of the mvc
	 * template, so ((n - 1) & 0xff) + 1 bytes are copied.
	 */
	larl	%r5,.Lmemmove_mvc
	ex	%r4,0(%r5)
	br	%r14
.Lmemmove_reverse:
	aghi	%r4,-1			/* %r4 = index of the last byte */
.Lmemmove_reverse_loop:
	ic	%r0,0(%r4,%r3)
	stc	%r0,0(%r4,%r1)
	brctg	%r4,.Lmemmove_reverse_loop
	/* The loop leaves with %r4 == 0; index 0 is still to be copied. */
	ic	%r0,0(%r4,%r3)
	stc	%r0,0(%r4,%r1)
	br	%r14
	/* Template for ex above; never reached by fall-through. */
.Lmemmove_mvc:
	mvc	0(1,%r1),0(%r3)
ENDPROC(memmove)
EXPORT_SYMBOL(memmove)
/*
 * memset implementation
 *
 * This code corresponds to the C construct below. We do distinguish
 * between clearing (c == 0) and setting a memory array (c != 0) simply
 * because nearly all memset invocations in the kernel clear memory and
 * the xc instruction is preferred in such cases.
 *
 * void *memset(void *s, int c, size_t n)
 * {
 *	if (likely(c == 0))
 *		return __builtin_memset(s, 0, n);
 *	return __builtin_memset(s, c, n);
 * }
 *
 * In:   %r2 = s, %r3 = c, %r4 = n
 * Out:  %r2 = s (never modified by this routine)
 * Uses: %r1 as the working pointer; %r3 and %r4 are consumed.
 *
 * The c == 0 test looks at the whole of %r3. A value whose low byte is
 * zero but which is non-zero elsewhere takes the fill path, which then
 * stores that zero low byte.
 */
ENTRY(memset)
	ltgr	%r4,%r4
	bzr	%r14			/* n == 0: nothing to do */
	ltgr	%r3,%r3
	jnz	.Lmemset_fill
	/* Clear path: xc of an area with itself yields zeros. */
	aghi	%r4,-1			/* %r4 = n - 1 */
	srlg	%r3,%r4,8		/* %r3 = (n - 1) / 256 full chunks */
	ltgr	%r3,%r3
	lgr	%r1,%r2			/* lgr leaves cc alone for the jz below */
	jz	.Lmemset_clear_rest
.Lmemset_clear_loop:
	xc	0(256,%r1),0(%r1)
	la	%r1,256(%r1)
	brctg	%r3,.Lmemset_clear_loop
.Lmemset_clear_rest:
	/* Executed xc clears the remaining ((n - 1) & 0xff) + 1 bytes. */
	larl	%r3,.Lmemset_xc
	ex	%r4,0(%r3)
	br	%r14
.Lmemset_fill:
	/*
	 * Fill path: store the byte once at s[0], then let an overlapping
	 * mvc from 0(%r1) to 1(%r1) carry it forward through the area.
	 */
	stc	%r3,0(%r2)
	cghi	%r4,1
	lgr	%r1,%r2			/* lgr leaves cc alone for the ber below */
	ber	%r14			/* n == 1: the stc above was all of it */
	aghi	%r4,-2			/* %r4 = n - 2 (s[0] is already done) */
	srlg	%r3,%r4,8		/* %r3 = (n - 2) / 256 full chunks */
	ltgr	%r3,%r3
	jz	.Lmemset_fill_rest
.Lmemset_fill_loop:
	mvc	1(256,%r1),0(%r1)
	la	%r1,256(%r1)
	brctg	%r3,.Lmemset_fill_loop
.Lmemset_fill_rest:
	/* Executed mvc fills the remaining ((n - 2) & 0xff) + 1 bytes. */
	larl	%r3,.Lmemset_mvc
	ex	%r4,0(%r3)
	br	%r14
	/* Templates for the ex instructions above; never reached directly. */
.Lmemset_xc:
	xc	0(1,%r1),0(%r1)
.Lmemset_mvc:
	mvc	1(1,%r1),0(%r1)
ENDPROC(memset)
EXPORT_SYMBOL(memset)
/*
 * memcpy implementation
 *
 * void *memcpy(void *dest, const void *src, size_t n)
 *
 * In:   %r2 = dest, %r3 = src, %r4 = n
 * Out:  %r2 = dest (never modified by this routine)
 * Uses: %r1 as the working dest, %r5 as chunk counter and then as the
 *       address of the mvc template; %r3 is advanced, %r4 is consumed.
 *
 * Copies (n - 1) / 256 chunks of 256 bytes with mvc, then the remaining
 * ((n - 1) & 0xff) + 1 bytes with one executed mvc. Sizes of at most
 * 256 bytes skip the loop entirely and stay on the fall-through path.
 */
ENTRY(memcpy)
	ltgr	%r4,%r4
	bzr	%r14			/* n == 0: nothing to do */
	aghi	%r4,-1			/* %r4 = n - 1 */
	srlg	%r5,%r4,8		/* %r5 = (n - 1) / 256 full chunks */
	ltgr	%r5,%r5
	lgr	%r1,%r2			/* lgr leaves cc alone for the jnz below */
	jnz	.Lmemcpy_loop
.Lmemcpy_rest:
	/* ex merges the low byte of %r4 into the mvc length field. */
	larl	%r5,.Lmemcpy_mvc
	ex	%r4,0(%r5)
	br	%r14
.Lmemcpy_loop:
	mvc	0(256,%r1),0(%r3)
	la	%r1,256(%r1)
	la	%r3,256(%r3)
	brctg	%r5,.Lmemcpy_loop
	j	.Lmemcpy_rest
	/* Template for ex above; never reached by fall-through. */
.Lmemcpy_mvc:
	mvc	0(1,%r1),0(%r3)
ENDPROC(memcpy)
EXPORT_SYMBOL(memcpy)