mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2024-12-05 06:56:52 +07:00
789c299ca2
Here is a patch from Paul Mackerras that improves the ppc64 copy_tofrom_user. The loop now does 32 bytes at a time and as well as pairing loads and stores. A quick test case that reads 8kB over and over shows the improvement: POWER6: 53% faster POWER7: 51% faster #define _XOPEN_SOURCE 500 #include <stdlib.h> #include <stdio.h> #include <unistd.h> #include <fcntl.h> #include <sys/types.h> #include <sys/stat.h> #define BUFSIZE (8 * 1024) #define ITERATIONS 10000000 int main() { char tmpfile[] = "/tmp/copy_to_user_testXXXXXX"; int fd; char *buf[BUFSIZE]; unsigned long i; fd = mkstemp(tmpfile); if (fd < 0) { perror("open"); exit(1); } if (write(fd, buf, BUFSIZE) != BUFSIZE) { perror("open"); exit(1); } for (i = 0; i < 10000000; i++) { if (pread(fd, buf, BUFSIZE, 0) != BUFSIZE) { perror("pread"); exit(1); } } unlink(tmpfile); return 0; } Signed-off-by: Anton Blanchard <anton@samba.org> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
646 lines
10 KiB
ArmAsm
646 lines
10 KiB
ArmAsm
/*
|
|
* Copyright (C) 2002 Paul Mackerras, IBM Corp.
|
|
*
|
|
* This program is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU General Public License
|
|
* as published by the Free Software Foundation; either version
|
|
* 2 of the License, or (at your option) any later version.
|
|
*/
|
|
#include <asm/processor.h>
|
|
#include <asm/ppc_asm.h>
|
|
|
|
.align 7
|
|
_GLOBAL(__copy_tofrom_user)
|
|
/* first check for a whole page copy on a page boundary */
|
|
cmpldi cr1,r5,16
|
|
cmpdi cr6,r5,4096
|
|
or r0,r3,r4
|
|
neg r6,r3 /* LS 3 bits = # bytes to 8-byte dest bdry */
|
|
andi. r0,r0,4095
|
|
std r3,-24(r1)
|
|
crand cr0*4+2,cr0*4+2,cr6*4+2
|
|
std r4,-16(r1)
|
|
std r5,-8(r1)
|
|
dcbt 0,r4
|
|
beq .Lcopy_page_4K
|
|
andi. r6,r6,7
|
|
PPC_MTOCRF 0x01,r5
|
|
blt cr1,.Lshort_copy
|
|
/* Below we want to nop out the bne if we're on a CPU that has the
|
|
* CPU_FTR_UNALIGNED_LD_STD bit set and the CPU_FTR_CP_USE_DCBTZ bit
|
|
* cleared.
|
|
* At the time of writing the only CPU that has this combination of bits
|
|
* set is Power6.
|
|
*/
|
|
BEGIN_FTR_SECTION
|
|
nop
|
|
FTR_SECTION_ELSE
|
|
bne .Ldst_unaligned
|
|
ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \
|
|
CPU_FTR_UNALIGNED_LD_STD)
|
|
.Ldst_aligned:
|
|
addi r3,r3,-16
|
|
BEGIN_FTR_SECTION
|
|
andi. r0,r4,7
|
|
bne .Lsrc_unaligned
|
|
END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
|
|
blt cr1,.Ldo_tail /* if < 16 bytes to copy */
|
|
srdi r0,r5,5
|
|
cmpdi cr1,r0,0
|
|
20: ld r7,0(r4)
|
|
220: ld r6,8(r4)
|
|
addi r4,r4,16
|
|
mtctr r0
|
|
andi. r0,r5,0x10
|
|
beq 22f
|
|
addi r3,r3,16
|
|
addi r4,r4,-16
|
|
mr r9,r7
|
|
mr r8,r6
|
|
beq cr1,72f
|
|
21: ld r7,16(r4)
|
|
221: ld r6,24(r4)
|
|
addi r4,r4,32
|
|
70: std r9,0(r3)
|
|
270: std r8,8(r3)
|
|
22: ld r9,0(r4)
|
|
222: ld r8,8(r4)
|
|
71: std r7,16(r3)
|
|
271: std r6,24(r3)
|
|
addi r3,r3,32
|
|
bdnz 21b
|
|
72: std r9,0(r3)
|
|
272: std r8,8(r3)
|
|
andi. r5,r5,0xf
|
|
beq+ 3f
|
|
addi r4,r4,16
|
|
.Ldo_tail:
|
|
addi r3,r3,16
|
|
bf cr7*4+0,246f
|
|
244: ld r9,0(r4)
|
|
addi r4,r4,8
|
|
245: std r9,0(r3)
|
|
addi r3,r3,8
|
|
246: bf cr7*4+1,1f
|
|
23: lwz r9,0(r4)
|
|
addi r4,r4,4
|
|
73: stw r9,0(r3)
|
|
addi r3,r3,4
|
|
1: bf cr7*4+2,2f
|
|
44: lhz r9,0(r4)
|
|
addi r4,r4,2
|
|
74: sth r9,0(r3)
|
|
addi r3,r3,2
|
|
2: bf cr7*4+3,3f
|
|
45: lbz r9,0(r4)
|
|
75: stb r9,0(r3)
|
|
3: li r3,0
|
|
blr
|
|
|
|
.Lsrc_unaligned:
|
|
srdi r6,r5,3
|
|
addi r5,r5,-16
|
|
subf r4,r0,r4
|
|
srdi r7,r5,4
|
|
sldi r10,r0,3
|
|
cmpldi cr6,r6,3
|
|
andi. r5,r5,7
|
|
mtctr r7
|
|
subfic r11,r10,64
|
|
add r5,r5,r0
|
|
bt cr7*4+0,28f
|
|
|
|
24: ld r9,0(r4) /* 3+2n loads, 2+2n stores */
|
|
25: ld r0,8(r4)
|
|
sld r6,r9,r10
|
|
26: ldu r9,16(r4)
|
|
srd r7,r0,r11
|
|
sld r8,r0,r10
|
|
or r7,r7,r6
|
|
blt cr6,79f
|
|
27: ld r0,8(r4)
|
|
b 2f
|
|
|
|
28: ld r0,0(r4) /* 4+2n loads, 3+2n stores */
|
|
29: ldu r9,8(r4)
|
|
sld r8,r0,r10
|
|
addi r3,r3,-8
|
|
blt cr6,5f
|
|
30: ld r0,8(r4)
|
|
srd r12,r9,r11
|
|
sld r6,r9,r10
|
|
31: ldu r9,16(r4)
|
|
or r12,r8,r12
|
|
srd r7,r0,r11
|
|
sld r8,r0,r10
|
|
addi r3,r3,16
|
|
beq cr6,78f
|
|
|
|
1: or r7,r7,r6
|
|
32: ld r0,8(r4)
|
|
76: std r12,8(r3)
|
|
2: srd r12,r9,r11
|
|
sld r6,r9,r10
|
|
33: ldu r9,16(r4)
|
|
or r12,r8,r12
|
|
77: stdu r7,16(r3)
|
|
srd r7,r0,r11
|
|
sld r8,r0,r10
|
|
bdnz 1b
|
|
|
|
78: std r12,8(r3)
|
|
or r7,r7,r6
|
|
79: std r7,16(r3)
|
|
5: srd r12,r9,r11
|
|
or r12,r8,r12
|
|
80: std r12,24(r3)
|
|
bne 6f
|
|
li r3,0
|
|
blr
|
|
6: cmpwi cr1,r5,8
|
|
addi r3,r3,32
|
|
sld r9,r9,r10
|
|
ble cr1,7f
|
|
34: ld r0,8(r4)
|
|
srd r7,r0,r11
|
|
or r9,r7,r9
|
|
7:
|
|
bf cr7*4+1,1f
|
|
rotldi r9,r9,32
|
|
94: stw r9,0(r3)
|
|
addi r3,r3,4
|
|
1: bf cr7*4+2,2f
|
|
rotldi r9,r9,16
|
|
95: sth r9,0(r3)
|
|
addi r3,r3,2
|
|
2: bf cr7*4+3,3f
|
|
rotldi r9,r9,8
|
|
96: stb r9,0(r3)
|
|
3: li r3,0
|
|
blr
|
|
|
|
.Ldst_unaligned:
|
|
PPC_MTOCRF 0x01,r6 /* put #bytes to 8B bdry into cr7 */
|
|
subf r5,r6,r5
|
|
li r7,0
|
|
cmpldi cr1,r5,16
|
|
bf cr7*4+3,1f
|
|
35: lbz r0,0(r4)
|
|
81: stb r0,0(r3)
|
|
addi r7,r7,1
|
|
1: bf cr7*4+2,2f
|
|
36: lhzx r0,r7,r4
|
|
82: sthx r0,r7,r3
|
|
addi r7,r7,2
|
|
2: bf cr7*4+1,3f
|
|
37: lwzx r0,r7,r4
|
|
83: stwx r0,r7,r3
|
|
3: PPC_MTOCRF 0x01,r5
|
|
add r4,r6,r4
|
|
add r3,r6,r3
|
|
b .Ldst_aligned
|
|
|
|
.Lshort_copy:
|
|
bf cr7*4+0,1f
|
|
38: lwz r0,0(r4)
|
|
39: lwz r9,4(r4)
|
|
addi r4,r4,8
|
|
84: stw r0,0(r3)
|
|
85: stw r9,4(r3)
|
|
addi r3,r3,8
|
|
1: bf cr7*4+1,2f
|
|
40: lwz r0,0(r4)
|
|
addi r4,r4,4
|
|
86: stw r0,0(r3)
|
|
addi r3,r3,4
|
|
2: bf cr7*4+2,3f
|
|
41: lhz r0,0(r4)
|
|
addi r4,r4,2
|
|
87: sth r0,0(r3)
|
|
addi r3,r3,2
|
|
3: bf cr7*4+3,4f
|
|
42: lbz r0,0(r4)
|
|
88: stb r0,0(r3)
|
|
4: li r3,0
|
|
blr
|
|
|
|
/*
|
|
* exception handlers follow
|
|
* we have to return the number of bytes not copied
|
|
* for an exception on a load, we set the rest of the destination to 0
|
|
*/
|
|
|
|
136:
|
|
137:
|
|
add r3,r3,r7
|
|
b 1f
|
|
130:
|
|
131:
|
|
addi r3,r3,8
|
|
120:
|
|
320:
|
|
122:
|
|
322:
|
|
124:
|
|
125:
|
|
126:
|
|
127:
|
|
128:
|
|
129:
|
|
133:
|
|
addi r3,r3,8
|
|
132:
|
|
addi r3,r3,8
|
|
121:
|
|
321:
|
|
344:
|
|
134:
|
|
135:
|
|
138:
|
|
139:
|
|
140:
|
|
141:
|
|
142:
|
|
123:
|
|
144:
|
|
145:
|
|
|
|
/*
|
|
* here we have had a fault on a load and r3 points to the first
|
|
* unmodified byte of the destination
|
|
*/
|
|
1: ld r6,-24(r1)
|
|
ld r4,-16(r1)
|
|
ld r5,-8(r1)
|
|
subf r6,r6,r3
|
|
add r4,r4,r6
|
|
subf r5,r6,r5 /* #bytes left to go */
|
|
|
|
/*
|
|
* first see if we can copy any more bytes before hitting another exception
|
|
*/
|
|
mtctr r5
|
|
43: lbz r0,0(r4)
|
|
addi r4,r4,1
|
|
89: stb r0,0(r3)
|
|
addi r3,r3,1
|
|
bdnz 43b
|
|
li r3,0 /* huh? all copied successfully this time? */
|
|
blr
|
|
|
|
/*
|
|
* here we have trapped again, need to clear ctr bytes starting at r3
|
|
*/
|
|
143: mfctr r5
|
|
li r0,0
|
|
mr r4,r3
|
|
mr r3,r5 /* return the number of bytes not copied */
|
|
1: andi. r9,r4,7
|
|
beq 3f
|
|
90: stb r0,0(r4)
|
|
addic. r5,r5,-1
|
|
addi r4,r4,1
|
|
bne 1b
|
|
blr
|
|
3: cmpldi cr1,r5,8
|
|
srdi r9,r5,3
|
|
andi. r5,r5,7
|
|
blt cr1,93f
|
|
mtctr r9
|
|
91: std r0,0(r4)
|
|
addi r4,r4,8
|
|
bdnz 91b
|
|
93: beqlr
|
|
mtctr r5
|
|
92: stb r0,0(r4)
|
|
addi r4,r4,1
|
|
bdnz 92b
|
|
blr
|
|
|
|
/*
|
|
* exception handlers for stores: we just need to work
|
|
* out how many bytes weren't copied
|
|
*/
|
|
182:
|
|
183:
|
|
add r3,r3,r7
|
|
b 1f
|
|
371:
|
|
180:
|
|
addi r3,r3,8
|
|
171:
|
|
177:
|
|
addi r3,r3,8
|
|
370:
|
|
372:
|
|
176:
|
|
178:
|
|
addi r3,r3,4
|
|
185:
|
|
addi r3,r3,4
|
|
170:
|
|
172:
|
|
345:
|
|
173:
|
|
174:
|
|
175:
|
|
179:
|
|
181:
|
|
184:
|
|
186:
|
|
187:
|
|
188:
|
|
189:
|
|
194:
|
|
195:
|
|
196:
|
|
1:
|
|
ld r6,-24(r1)
|
|
ld r5,-8(r1)
|
|
add r6,r6,r5
|
|
subf r3,r3,r6 /* #bytes not copied */
|
|
190:
|
|
191:
|
|
192:
|
|
blr /* #bytes not copied in r3 */
|
|
|
|
.section __ex_table,"a"
|
|
.align 3
|
|
.llong 20b,120b
|
|
.llong 220b,320b
|
|
.llong 21b,121b
|
|
.llong 221b,321b
|
|
.llong 70b,170b
|
|
.llong 270b,370b
|
|
.llong 22b,122b
|
|
.llong 222b,322b
|
|
.llong 71b,171b
|
|
.llong 271b,371b
|
|
.llong 72b,172b
|
|
.llong 272b,372b
|
|
.llong 244b,344b
|
|
.llong 245b,345b
|
|
.llong 23b,123b
|
|
.llong 73b,173b
|
|
.llong 44b,144b
|
|
.llong 74b,174b
|
|
.llong 45b,145b
|
|
.llong 75b,175b
|
|
.llong 24b,124b
|
|
.llong 25b,125b
|
|
.llong 26b,126b
|
|
.llong 27b,127b
|
|
.llong 28b,128b
|
|
.llong 29b,129b
|
|
.llong 30b,130b
|
|
.llong 31b,131b
|
|
.llong 32b,132b
|
|
.llong 76b,176b
|
|
.llong 33b,133b
|
|
.llong 77b,177b
|
|
.llong 78b,178b
|
|
.llong 79b,179b
|
|
.llong 80b,180b
|
|
.llong 34b,134b
|
|
.llong 94b,194b
|
|
.llong 95b,195b
|
|
.llong 96b,196b
|
|
.llong 35b,135b
|
|
.llong 81b,181b
|
|
.llong 36b,136b
|
|
.llong 82b,182b
|
|
.llong 37b,137b
|
|
.llong 83b,183b
|
|
.llong 38b,138b
|
|
.llong 39b,139b
|
|
.llong 84b,184b
|
|
.llong 85b,185b
|
|
.llong 40b,140b
|
|
.llong 86b,186b
|
|
.llong 41b,141b
|
|
.llong 87b,187b
|
|
.llong 42b,142b
|
|
.llong 88b,188b
|
|
.llong 43b,143b
|
|
.llong 89b,189b
|
|
.llong 90b,190b
|
|
.llong 91b,191b
|
|
.llong 92b,192b
|
|
|
|
.text
|
|
|
|
/*
|
|
* Routine to copy a whole page of data, optimized for POWER4.
|
|
* On POWER4 it is more than 50% faster than the simple loop
|
|
* above (following the .Ldst_aligned label) but it runs slightly
|
|
* slower on POWER3.
|
|
*/
|
|
.Lcopy_page_4K:
|
|
std r31,-32(1)
|
|
std r30,-40(1)
|
|
std r29,-48(1)
|
|
std r28,-56(1)
|
|
std r27,-64(1)
|
|
std r26,-72(1)
|
|
std r25,-80(1)
|
|
std r24,-88(1)
|
|
std r23,-96(1)
|
|
std r22,-104(1)
|
|
std r21,-112(1)
|
|
std r20,-120(1)
|
|
li r5,4096/32 - 1
|
|
addi r3,r3,-8
|
|
li r0,5
|
|
0: addi r5,r5,-24
|
|
mtctr r0
|
|
20: ld r22,640(4)
|
|
21: ld r21,512(4)
|
|
22: ld r20,384(4)
|
|
23: ld r11,256(4)
|
|
24: ld r9,128(4)
|
|
25: ld r7,0(4)
|
|
26: ld r25,648(4)
|
|
27: ld r24,520(4)
|
|
28: ld r23,392(4)
|
|
29: ld r10,264(4)
|
|
30: ld r8,136(4)
|
|
31: ldu r6,8(4)
|
|
cmpwi r5,24
|
|
1:
|
|
32: std r22,648(3)
|
|
33: std r21,520(3)
|
|
34: std r20,392(3)
|
|
35: std r11,264(3)
|
|
36: std r9,136(3)
|
|
37: std r7,8(3)
|
|
38: ld r28,648(4)
|
|
39: ld r27,520(4)
|
|
40: ld r26,392(4)
|
|
41: ld r31,264(4)
|
|
42: ld r30,136(4)
|
|
43: ld r29,8(4)
|
|
44: std r25,656(3)
|
|
45: std r24,528(3)
|
|
46: std r23,400(3)
|
|
47: std r10,272(3)
|
|
48: std r8,144(3)
|
|
49: std r6,16(3)
|
|
50: ld r22,656(4)
|
|
51: ld r21,528(4)
|
|
52: ld r20,400(4)
|
|
53: ld r11,272(4)
|
|
54: ld r9,144(4)
|
|
55: ld r7,16(4)
|
|
56: std r28,664(3)
|
|
57: std r27,536(3)
|
|
58: std r26,408(3)
|
|
59: std r31,280(3)
|
|
60: std r30,152(3)
|
|
61: stdu r29,24(3)
|
|
62: ld r25,664(4)
|
|
63: ld r24,536(4)
|
|
64: ld r23,408(4)
|
|
65: ld r10,280(4)
|
|
66: ld r8,152(4)
|
|
67: ldu r6,24(4)
|
|
bdnz 1b
|
|
68: std r22,648(3)
|
|
69: std r21,520(3)
|
|
70: std r20,392(3)
|
|
71: std r11,264(3)
|
|
72: std r9,136(3)
|
|
73: std r7,8(3)
|
|
74: addi r4,r4,640
|
|
75: addi r3,r3,648
|
|
bge 0b
|
|
mtctr r5
|
|
76: ld r7,0(4)
|
|
77: ld r8,8(4)
|
|
78: ldu r9,16(4)
|
|
3:
|
|
79: ld r10,8(4)
|
|
80: std r7,8(3)
|
|
81: ld r7,16(4)
|
|
82: std r8,16(3)
|
|
83: ld r8,24(4)
|
|
84: std r9,24(3)
|
|
85: ldu r9,32(4)
|
|
86: stdu r10,32(3)
|
|
bdnz 3b
|
|
4:
|
|
87: ld r10,8(4)
|
|
88: std r7,8(3)
|
|
89: std r8,16(3)
|
|
90: std r9,24(3)
|
|
91: std r10,32(3)
|
|
9: ld r20,-120(1)
|
|
ld r21,-112(1)
|
|
ld r22,-104(1)
|
|
ld r23,-96(1)
|
|
ld r24,-88(1)
|
|
ld r25,-80(1)
|
|
ld r26,-72(1)
|
|
ld r27,-64(1)
|
|
ld r28,-56(1)
|
|
ld r29,-48(1)
|
|
ld r30,-40(1)
|
|
ld r31,-32(1)
|
|
li r3,0
|
|
blr
|
|
|
|
/*
|
|
* on an exception, reset to the beginning and jump back into the
|
|
* standard __copy_tofrom_user
|
|
*/
|
|
100: ld r20,-120(1)
|
|
ld r21,-112(1)
|
|
ld r22,-104(1)
|
|
ld r23,-96(1)
|
|
ld r24,-88(1)
|
|
ld r25,-80(1)
|
|
ld r26,-72(1)
|
|
ld r27,-64(1)
|
|
ld r28,-56(1)
|
|
ld r29,-48(1)
|
|
ld r30,-40(1)
|
|
ld r31,-32(1)
|
|
ld r3,-24(r1)
|
|
ld r4,-16(r1)
|
|
li r5,4096
|
|
b .Ldst_aligned
|
|
|
|
.section __ex_table,"a"
|
|
.align 3
|
|
.llong 20b,100b
|
|
.llong 21b,100b
|
|
.llong 22b,100b
|
|
.llong 23b,100b
|
|
.llong 24b,100b
|
|
.llong 25b,100b
|
|
.llong 26b,100b
|
|
.llong 27b,100b
|
|
.llong 28b,100b
|
|
.llong 29b,100b
|
|
.llong 30b,100b
|
|
.llong 31b,100b
|
|
.llong 32b,100b
|
|
.llong 33b,100b
|
|
.llong 34b,100b
|
|
.llong 35b,100b
|
|
.llong 36b,100b
|
|
.llong 37b,100b
|
|
.llong 38b,100b
|
|
.llong 39b,100b
|
|
.llong 40b,100b
|
|
.llong 41b,100b
|
|
.llong 42b,100b
|
|
.llong 43b,100b
|
|
.llong 44b,100b
|
|
.llong 45b,100b
|
|
.llong 46b,100b
|
|
.llong 47b,100b
|
|
.llong 48b,100b
|
|
.llong 49b,100b
|
|
.llong 50b,100b
|
|
.llong 51b,100b
|
|
.llong 52b,100b
|
|
.llong 53b,100b
|
|
.llong 54b,100b
|
|
.llong 55b,100b
|
|
.llong 56b,100b
|
|
.llong 57b,100b
|
|
.llong 58b,100b
|
|
.llong 59b,100b
|
|
.llong 60b,100b
|
|
.llong 61b,100b
|
|
.llong 62b,100b
|
|
.llong 63b,100b
|
|
.llong 64b,100b
|
|
.llong 65b,100b
|
|
.llong 66b,100b
|
|
.llong 67b,100b
|
|
.llong 68b,100b
|
|
.llong 69b,100b
|
|
.llong 70b,100b
|
|
.llong 71b,100b
|
|
.llong 72b,100b
|
|
.llong 73b,100b
|
|
.llong 74b,100b
|
|
.llong 75b,100b
|
|
.llong 76b,100b
|
|
.llong 77b,100b
|
|
.llong 78b,100b
|
|
.llong 79b,100b
|
|
.llong 80b,100b
|
|
.llong 81b,100b
|
|
.llong 82b,100b
|
|
.llong 83b,100b
|
|
.llong 84b,100b
|
|
.llong 85b,100b
|
|
.llong 86b,100b
|
|
.llong 87b,100b
|
|
.llong 88b,100b
|
|
.llong 89b,100b
|
|
.llong 90b,100b
|
|
.llong 91b,100b
|