mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2024-12-28 11:18:45 +07:00
f8a15f9766
ARCv2 optimized memcpy uses the PREFETCHW instruction for prefetching the next cache line but doesn't ensure that the line is not past the end of the buffer. PREFETCHW changes the line ownership and marks it dirty, which can cause data corruption if this area is used for DMA IO. Fix the issue by avoiding the PREFETCHW. This leads to performance degradation, but that is OK as we'll introduce a new memcpy implementation optimized for unaligned memory accesses. We also cut off all PREFETCH instructions as they are quite useless here: * we call PREFETCH right before the LOAD instruction call. * we copy 16 or 32 bytes of data (depending on CONFIG_ARC_HAS_LL64) in the main logical loop, so we call PREFETCH 4 times (or 2 times) for each L1 cache line (in the case of a 64B L1 cache line, which is the default case). Obviously this is not optimal. Signed-off-by: Eugeniy Paltsev <Eugeniy.Paltsev@synopsys.com> Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
223 lines
4.4 KiB
ArmAsm
223 lines
4.4 KiB
ArmAsm
/*
|
|
* Copyright (C) 2014-15 Synopsys, Inc. (www.synopsys.com)
|
|
*
|
|
* This program is free software; you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License version 2 as
|
|
* published by the Free Software Foundation.
|
|
*/
|
|
|
|
#include <linux/linkage.h>
|
|
|
|
/*
 * Byte-lane helpers used by the memcpy paths whose source pointer is not
 * 4-byte aligned.  Those paths load whole words and build each output word
 * from bytes carried over from the previous load plus bytes of the current
 * one.  The shift direction depends on the endianness, so every helper has
 * two expansions; the big-endian set is selected whenever
 * __LITTLE_ENDIAN__ is not defined.
 *
 *   SHIFT_1(RX,RY,IMM)    RX = RY shifted by IMM; left for little endian,
 *                         right for big endian.
 *   SHIFT_2(RX,RY,IMM)    RX = RY shifted by IMM in the direction opposite
 *                         to SHIFT_1.
 *   MERGE_1(RX,RY,IMM)    RX = RY << IMM for both endiannesses.
 *   MERGE_2(RX,RY,IMM)    expands to nothing for little endian;
 *                         RX = RY << IMM for big endian.
 *   EXTRACT_1(RX,RY,IMM)  little endian: RX = RY & 0xFFFF (IMM unused);
 *                         big endian: RX = RY >> IMM.
 *   EXTRACT_2(RX,RY,IMM)  little endian: RX = RY >> IMM;
 *                         big endian: RX = RY >> 8 (IMM unused).
 *
 * The trailing ";" annotations are assembler comments that travel with the
 * macro expansion.
 */
#if defined(__LITTLE_ENDIAN__)
#define SHIFT_1(RX,RY,IMM)	asl	RX, RY, IMM	; <<
#define SHIFT_2(RX,RY,IMM)	lsr	RX, RY, IMM	; >>
#define MERGE_1(RX,RY,IMM)	asl	RX, RY, IMM
#define MERGE_2(RX,RY,IMM)
#define EXTRACT_1(RX,RY,IMM)	and	RX, RY, 0xFFFF
#define EXTRACT_2(RX,RY,IMM)	lsr	RX, RY, IMM
#else /* big endian */
#define SHIFT_1(RX,RY,IMM)	lsr	RX, RY, IMM	; >>
#define SHIFT_2(RX,RY,IMM)	asl	RX, RY, IMM	; <<
#define MERGE_1(RX,RY,IMM)	asl	RX, RY, IMM	; <<
#define MERGE_2(RX,RY,IMM)	asl	RX, RY, IMM	; <<
#define EXTRACT_1(RX,RY,IMM)	lsr	RX, RY, IMM
#define EXTRACT_2(RX,RY,IMM)	lsr	RX, RY, 0x08
#endif /* __LITTLE_ENDIAN__ */
|
|
|
|
/*
 * Access width of the bulk loop that memcpy runs when source and
 * destination are both 4-byte aligned.  That loop issues four LOADX and
 * four STOREX per trip.
 *
 *   LOADX(DST,RX)    post-incrementing load through pointer RX:
 *                    ldd.ab stepping 8 bytes with CONFIG_ARC_HAS_LL64,
 *                    ld.ab stepping 4 bytes without it.
 *   STOREX(SRC,RX)   the matching post-incrementing store (std.ab / st.ab).
 *   ZOLSHFT          right shift that turns a byte count into the number
 *                    of loop trips: 5 (32 bytes per trip) or 4 (16 bytes).
 *   ZOLAND           mask that yields the bytes left over after the loop:
 *                    0x1F or 0xF.
 */
#if defined(CONFIG_ARC_HAS_LL64)
#define LOADX(DST,RX)		ldd.ab	DST, [RX, 8]
#define STOREX(SRC,RX)		std.ab	SRC, [RX, 8]
#define ZOLSHFT			5
#define ZOLAND			0x1F
#else /* no 64-bit load/store */
#define LOADX(DST,RX)		ld.ab	DST, [RX, 4]
#define STOREX(SRC,RX)		st.ab	SRC, [RX, 4]
#define ZOLSHFT			4
#define ZOLAND			0xF
#endif /* CONFIG_ARC_HAS_LL64 */
|
|
|
|
/*
 * memcpy: copy r2 bytes from the buffer at r1 to the buffer at r0.
 *
 * Register use, as visible in the code below:
 *   r0        destination pointer on entry.  It is never written, so it
 *             still holds the destination when control returns via blink.
 *   r1        source pointer, advanced by every post-incrementing load.
 *   r2        number of bytes still to be accounted for.
 *   r3        working copy of r0, advanced by every post-incrementing store.
 *   r4 - r10  scratch.  NOTE(review): with CONFIG_ARC_HAS_LL64 the 64-bit
 *             accesses presumably also use the odd partner of each named
 *             register (r5, r7, r9, r11) - confirm against the ISA manual.
 *   lp_count  trip count of the hardware loops started with lpnz.
 *
 * Outline:
 *   1. A zero length returns immediately.
 *   2. Lengths of 8 bytes or less are copied one byte at a time.
 *   3. Otherwise 4 - (dst & 3) bytes are copied bytewise when the
 *      destination is not 4-byte aligned, after which the code dispatches
 *      on (src & 3):
 *        0  bulk loop of four LOADX / four STOREX per trip, then a byte
 *           loop for the remainder;
 *        1, 2, 3  the leading 3, 2 or 1 source bytes are read into r5 so
 *           that all further loads are word sized.  A loop then handles
 *           8 bytes per trip, combining the pending bytes in r5 with each
 *           new word through SHIFT_1 / SHIFT_2.  The bytes still pending
 *           in r5 are stored after the loop, followed by a byte loop for
 *           the last (r2 & 7) bytes.
 *
 * Instructions carrying the .d suffix have a delay slot: the instruction
 * that follows them is executed whether or not the branch is taken.
 */
ENTRY_CFI(memcpy)
	mov.f	0, r2			; only the flags are wanted: is len zero
	jz.d	[blink]			; nothing to copy, return
	mov	r3, r0			; delay slot: r3 walks dst, r0 is left alone

	;; Short copies (len <= 8) use the byte loop directly.
	cmp	r2, 8
	bls.d	@.Ltail_bytes
	mov.f	lp_count, r2		; delay slot: byte loop trip count = len

	;; Head: bring the destination up to a 4-byte boundary.
	and.f	r4, r0, 0x03		; zero flag set when dst is already aligned
	rsub	lp_count, r4, 4		; lp_count = 4 - (dst & 3)
	lpnz	@.Ldst_aligned
	;; loop body
	ldb.ab	r5, [r1, 1]
	sub	r2, r2, 1
	stb.ab	r5, [r3, 1]
.Ldst_aligned:

	;; Dispatch on the alignment of the source pointer.
	and.f	r4, r1, 0x03
	bnz.d	@.Lsrc_misaligned

	;; ---- CASE 0: source and destination are both 4-byte aligned ----
	;; The next instruction sits in the delay slot of the branch above.
	;; One loop trip moves four load/store units.
	lsr.f	lp_count, r2, ZOLSHFT
	lpnz	@.Lbulk_done
	;; loop body
	LOADX (r6, r1)
	LOADX (r8, r1)
	LOADX (r10, r1)
	LOADX (r4, r1)
	STOREX (r6, r3)
	STOREX (r8, r3)
	STOREX (r10, r3)
	STOREX (r4, r3)
.Lbulk_done:

	and.f	lp_count, r2, ZOLAND	; bytes the bulk loop did not cover
.Ltail_bytes:
	;; Also entered from the short-copy test with lp_count = len.
	lpnz	@.Ltail_done
	;; loop body
	ldb.ab	r5, [r1, 1]
	stb.ab	r5, [r3, 1]
.Ltail_done:

	j	[blink]
	;; ---- end of CASE 0 ----

.Lsrc_misaligned:
	;; r4 = src & 3, which is 1, 2 or 3 here.
	cmp	r4, 2
	beq.d	@.Lsrc_off2
	sub	r2, r2, 1		; delay slot: runs for all three cases

	bhi.d	@.Lsrc_off3
	ldb.ab	r5, [r1, 1]		; delay slot: runs for offsets 1 and 3

	;; ---- CASE 1: src & 3 == 1 ----
	;; One byte is already in r5; a halfword more reaches the next
	;; 4-byte boundary of the source.  Three bytes are then pending.
	ldh.ab	r6, [r1, 2]
	sub	r2, r2, 2
	lsr.f	lp_count, r2, 3		; 8 bytes (two words) per trip
	MERGE_1 (r6, r6, 8)
	MERGE_2 (r5, r5, 24)
	or	r5, r5, r6		; r5 = the three pending bytes

	;; Word loop: all loads and stores are word sized from here.
	lpnz	@.Loff1_words_done
	;; loop body
	ld.ab	r6, [r1, 4]
	ld.ab	r8, [r1, 4]

	SHIFT_1 (r7, r6, 24)
	or	r7, r7, r5		; output word = pending bytes + new bytes
	SHIFT_2 (r5, r6, 8)		; rest of r6 becomes the pending bytes

	SHIFT_1 (r9, r8, 24)
	or	r9, r9, r5
	SHIFT_2 (r5, r8, 8)

	st.ab	r7, [r3, 4]
	st.ab	r9, [r3, 4]
.Loff1_words_done:

	;; Store the three pending bytes: a halfword, then a byte.
	EXTRACT_1 (r6, r5, 16)
	sth.ab	r6, [r3, 2]
	EXTRACT_2 (r5, r5, 16)
	stb.ab	r5, [r3, 1]

	and.f	lp_count, r2, 0x07	; up to 7 bytes left
	lpnz	@.Loff1_done
	;; loop body
	ldb.ab	r6, [r1, 1]
	stb.ab	r6, [r3, 1]
.Loff1_done:
	j	[blink]

.Lsrc_off2:
	;; ---- CASE 2: src & 3 == 2 ----
	;; A halfword reaches the next 4-byte boundary of the source; it
	;; stays pending in r5.  Together with the shared delay slot above,
	;; r2 is reduced by 2 in total.
	ldh.ab	r5, [r1, 2]
	sub	r2, r2, 1

	lsr.f	lp_count, r2, 3		; 8 bytes (two words) per trip
#ifdef __BIG_ENDIAN__
	;; Only when the word loop will run: move the pending halfword to
	;; the upper half, where the loop expects it.
	asl.nz	r5, r5, 16
#endif
	lpnz	@.Loff2_words_done
	;; loop body
	ld.ab	r6, [r1, 4]
	ld.ab	r8, [r1, 4]

	SHIFT_1 (r7, r6, 16)
	or	r7, r7, r5
	SHIFT_2 (r5, r6, 16)

	SHIFT_1 (r9, r8, 16)
	or	r9, r9, r5
	SHIFT_2 (r5, r8, 16)

	st.ab	r7, [r3, 4]
	st.ab	r9, [r3, 4]
.Loff2_words_done:

#ifdef __BIG_ENDIAN__
	;; The flags still hold the result of the trip count computation,
	;; so this undoes the shift exactly when it was applied.
	lsr.nz	r5, r5, 16
#endif
	sth.ab	r5, [r3, 2]		; store the pending halfword

	and.f	lp_count, r2, 0x07	; up to 7 bytes left
	lpnz	@.Loff2_done
	;; loop body
	ldb.ab	r6, [r1, 1]
	stb.ab	r6, [r3, 1]
.Loff2_done:
	j	[blink]

.Lsrc_off3:
	;; ---- CASE 3: src & 3 == 3 ----
	;; The byte read in the delay slot above already brought the source
	;; to a 4-byte boundary; it stays pending in r5.
	lsr.f	lp_count, r2, 3		; 8 bytes (two words) per trip
#ifdef __BIG_ENDIAN__
	;; Only when the word loop will run: move the pending byte to the
	;; top byte, where the loop expects it.
	asl.nz	r5, r5, 24
#endif
	lpnz	@.Loff3_words_done
	;; loop body
	ld.ab	r6, [r1, 4]
	ld.ab	r8, [r1, 4]

	SHIFT_1 (r7, r6, 8)
	or	r7, r7, r5
	SHIFT_2 (r5, r6, 24)

	SHIFT_1 (r9, r8, 8)
	or	r9, r9, r5
	SHIFT_2 (r5, r8, 24)

	st.ab	r7, [r3, 4]
	st.ab	r9, [r3, 4]
.Loff3_words_done:

#ifdef __BIG_ENDIAN__
	;; Undo the shift exactly when it was applied (flags unchanged).
	lsr.nz	r5, r5, 24
#endif
	stb.ab	r5, [r3, 1]		; store the pending byte

	and.f	lp_count, r2, 0x07	; up to 7 bytes left
	lpnz	@.Loff3_done
	;; loop body
	ldb.ab	r6, [r1, 1]
	stb.ab	r6, [r3, 1]
.Loff3_done:
	j	[blink]

END_CFI(memcpy)
|